#ifndef CUFFTDX_FFT_243_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_243_FP32_INV_PTX_HPP



template<> __forceinline__ __device__ void cufftdx_private_function<334, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1626>;
.reg .b32 r<14>;
.reg .b64 rd<9>;
mov.u32 r12, %tid.y;
mov.u32 r13, %54;
mad.lo.s32 r3, r12, 1944, r13;
add.f32 f109, %74, %92;
add.f32 f110, %56, f109;
mul.f32 f113, f109, 0f3F000000;
sub.f32 f114, %56, f113;
add.f32 f1625, %75, %93;
sub.f32 f115, %75, %93;
mul.f32 f116, f115, 0fBF5DB3D7;
add.f32 f117, f116, f114;
sub.f32 f118, f114, f116;
add.f32 f1624, %57, f1625;
mul.f32 f119, f1625, 0f3F000000;
sub.f32 f120, %57, f119;
sub.f32 f121, %74, %92;
mul.f32 f122, f121, 0fBF5DB3D7;
sub.f32 f123, f120, f122;
add.f32 f124, f122, f120;
add.f32 f125, %80, %98;
add.f32 f126, %62, f125;
mul.f32 f129, f125, 0f3F000000;
sub.f32 f130, %62, f129;
add.f32 f1623, %81, %99;
sub.f32 f131, %81, %99;
mul.f32 f132, f131, 0fBF5DB3D7;
add.f32 f133, f132, f130;
sub.f32 f134, f130, f132;
add.f32 f1622, %63, f1623;
mul.f32 f135, f1623, 0f3F000000;
sub.f32 f136, %63, f135;
sub.f32 f137, %80, %98;
mul.f32 f138, f137, 0fBF5DB3D7;
sub.f32 f139, f136, f138;
add.f32 f140, f138, f136;
add.f32 f141, %86, %104;
add.f32 f142, %68, f141;
mul.f32 f145, f141, 0f3F000000;
sub.f32 f146, %68, f145;
add.f32 f1621, %87, %105;
sub.f32 f147, %87, %105;
mul.f32 f148, f147, 0fBF5DB3D7;
add.f32 f149, f148, f146;
sub.f32 f150, f146, f148;
add.f32 f1620, %69, f1621;
mul.f32 f151, f1621, 0f3F000000;
sub.f32 f152, %69, f151;
sub.f32 f153, %86, %104;
mul.f32 f154, f153, 0fBF5DB3D7;
sub.f32 f155, f152, f154;
add.f32 f156, f154, f152;
mul.f32 f158, f139, 0f3F248DBB;
mul.f32 f1619, f133, 0f3F441B7D;
sub.f32 f159, f1619, f158;
mul.f32 f160, f139, 0f3F441B7D;
fma.rn.f32 f161, f133, 0f3F248DBB, f160;
mul.f32 f1617, f149, 0f3E31D0D4;
mul.f32 f1618, f155, 0f3F7C1C5C;
sub.f32 f164, f1617, f1618;
mul.f32 f165, f155, 0f3E31D0D4;
fma.rn.f32 f166, f149, 0f3F7C1C5C, f165;
mul.f32 f1615, f134, 0f3E31D0D4;
mul.f32 f1616, f140, 0f3F7C1C5C;
sub.f32 f169, f1615, f1616;
mul.f32 f170, f140, 0f3E31D0D4;
fma.rn.f32 f171, f134, 0f3F7C1C5C, f170;
mul.f32 f1613, f150, 0fBF708FB2;
mul.f32 f1614, f156, 0f3EAF1D44;
sub.f32 f174, f1613, f1614;
mul.f32 f175, f156, 0fBF708FB2;
fma.rn.f32 f176, f150, 0f3EAF1D44, f175;
add.f32 f177, f126, f142;
add.f32 f178, f110, f177;
mul.f32 f181, f177, 0f3F000000;
sub.f32 f182, f110, f181;
add.f32 f1612, f1622, f1620;
sub.f32 f183, f1622, f1620;
mul.f32 f184, f183, 0fBF5DB3D7;
add.f32 f185, f184, f182;
sub.f32 f186, f182, f184;
add.f32 f1611, f1624, f1612;
mul.f32 f187, f1612, 0f3F000000;
sub.f32 f188, f1624, f187;
sub.f32 f189, f126, f142;
mul.f32 f190, f189, 0fBF5DB3D7;
sub.f32 f191, f188, f190;
add.f32 f192, f190, f188;
add.f32 f193, f159, f164;
add.f32 f194, f117, f193;
mul.f32 f197, f193, 0f3F000000;
sub.f32 f198, f117, f197;
add.f32 f1610, f161, f166;
sub.f32 f199, f161, f166;
mul.f32 f200, f199, 0fBF5DB3D7;
add.f32 f201, f200, f198;
sub.f32 f202, f198, f200;
add.f32 f1609, f123, f1610;
mul.f32 f203, f1610, 0f3F000000;
sub.f32 f204, f123, f203;
sub.f32 f205, f159, f164;
mul.f32 f206, f205, 0fBF5DB3D7;
sub.f32 f207, f204, f206;
add.f32 f208, f206, f204;
add.f32 f209, f169, f174;
add.f32 f210, f118, f209;
mul.f32 f213, f209, 0f3F000000;
sub.f32 f214, f118, f213;
add.f32 f1608, f171, f176;
sub.f32 f215, f171, f176;
mul.f32 f216, f215, 0fBF5DB3D7;
add.f32 f217, f216, f214;
sub.f32 f218, f214, f216;
add.f32 f1607, f124, f1608;
mul.f32 f219, f1608, 0f3F000000;
sub.f32 f220, f124, f219;
sub.f32 f221, f169, f174;
mul.f32 f222, f221, 0fBF5DB3D7;
sub.f32 f223, f220, f222;
add.f32 f224, f222, f220;
add.f32 f225, %76, %94;
add.f32 f226, %58, f225;
mul.f32 f229, f225, 0f3F000000;
sub.f32 f230, %58, f229;
add.f32 f1604, %111, %110;
sub.f32 f231, %111, %110;
mul.f32 f232, f231, 0fBF5DB3D7;
add.f32 f233, f232, f230;
sub.f32 f234, f230, f232;
add.f32 f1602, %112, f1604;
mul.f32 f235, f1604, 0f3F000000;
sub.f32 f236, %112, f235;
sub.f32 f237, %76, %94;
mul.f32 f238, f237, 0fBF5DB3D7;
sub.f32 f239, f236, f238;
add.f32 f240, f238, f236;
add.f32 f241, %82, %100;
add.f32 f242, %64, f241;
mul.f32 f245, f241, 0f3F000000;
sub.f32 f246, %64, f245;
add.f32 f1599, %114, %113;
sub.f32 f247, %114, %113;
mul.f32 f248, f247, 0fBF5DB3D7;
add.f32 f249, f248, f246;
sub.f32 f250, f246, f248;
add.f32 f1597, %115, f1599;
mul.f32 f251, f1599, 0f3F000000;
sub.f32 f252, %115, f251;
sub.f32 f253, %82, %100;
mul.f32 f254, f253, 0fBF5DB3D7;
sub.f32 f255, f252, f254;
add.f32 f256, f254, f252;
add.f32 f257, %88, %106;
add.f32 f258, %70, f257;
mul.f32 f261, f257, 0f3F000000;
sub.f32 f262, %70, f261;
add.f32 f1594, %116, %117;
sub.f32 f263, %116, %117;
mul.f32 f264, f263, 0fBF5DB3D7;
add.f32 f265, f264, f262;
sub.f32 f266, f262, f264;
add.f32 f1592, %118, f1594;
mul.f32 f267, f1594, 0f3F000000;
sub.f32 f268, %118, f267;
sub.f32 f269, %88, %106;
mul.f32 f270, f269, 0fBF5DB3D7;
sub.f32 f271, f268, f270;
add.f32 f272, f270, f268;
mul.f32 f274, f255, 0f3F248DBB;
mul.f32 f1591, f249, 0f3F441B7D;
sub.f32 f275, f1591, f274;
mul.f32 f276, f255, 0f3F441B7D;
fma.rn.f32 f277, f249, 0f3F248DBB, f276;
mul.f32 f279, f271, 0f3F7C1C5C;
mul.f32 f1590, f265, 0f3E31D0D4;
sub.f32 f280, f1590, f279;
mul.f32 f281, f271, 0f3E31D0D4;
fma.rn.f32 f282, f265, 0f3F7C1C5C, f281;
mul.f32 f1588, f250, 0f3E31D0D4;
mul.f32 f1589, f256, 0f3F7C1C5C;
sub.f32 f285, f1588, f1589;
mul.f32 f286, f256, 0f3E31D0D4;
fma.rn.f32 f287, f250, 0f3F7C1C5C, f286;
mul.f32 f1586, f266, 0fBF708FB2;
mul.f32 f1587, f272, 0f3EAF1D44;
sub.f32 f290, f1586, f1587;
mul.f32 f291, f272, 0fBF708FB2;
fma.rn.f32 f292, f266, 0f3EAF1D44, f291;
add.f32 f293, f242, f258;
add.f32 f294, f226, f293;
mul.f32 f297, f293, 0f3F000000;
sub.f32 f298, f226, f297;
add.f32 f1585, f1597, f1592;
sub.f32 f299, f1597, f1592;
mul.f32 f300, f299, 0fBF5DB3D7;
add.f32 f301, f300, f298;
sub.f32 f302, f298, f300;
add.f32 f1584, f1602, f1585;
mul.f32 f303, f1585, 0f3F000000;
sub.f32 f304, f1602, f303;
sub.f32 f305, f242, f258;
mul.f32 f306, f305, 0fBF5DB3D7;
sub.f32 f307, f304, f306;
add.f32 f308, f306, f304;
add.f32 f309, f275, f280;
add.f32 f310, f233, f309;
mul.f32 f313, f309, 0f3F000000;
sub.f32 f314, f233, f313;
add.f32 f1583, f277, f282;
sub.f32 f315, f277, f282;
mul.f32 f316, f315, 0fBF5DB3D7;
add.f32 f317, f316, f314;
sub.f32 f318, f314, f316;
add.f32 f1582, f239, f1583;
mul.f32 f319, f1583, 0f3F000000;
sub.f32 f320, f239, f319;
sub.f32 f321, f275, f280;
mul.f32 f322, f321, 0fBF5DB3D7;
sub.f32 f323, f320, f322;
add.f32 f324, f322, f320;
add.f32 f325, f285, f290;
add.f32 f326, f234, f325;
mul.f32 f329, f325, 0f3F000000;
sub.f32 f330, f234, f329;
add.f32 f1581, f287, f292;
sub.f32 f331, f287, f292;
mul.f32 f332, f331, 0fBF5DB3D7;
add.f32 f333, f332, f330;
sub.f32 f334, f330, f332;
add.f32 f1580, f240, f1581;
mul.f32 f335, f1581, 0f3F000000;
sub.f32 f336, f240, f335;
sub.f32 f337, f285, f290;
mul.f32 f338, f337, 0fBF5DB3D7;
sub.f32 f339, f336, f338;
add.f32 f340, f338, f336;
add.f32 f341, %78, %96;
add.f32 f342, %60, f341;
mul.f32 f345, f341, 0f3F000000;
sub.f32 f346, %60, f345;
add.f32 f1577, %119, %120;
sub.f32 f347, %119, %120;
mul.f32 f348, f347, 0fBF5DB3D7;
add.f32 f349, f348, f346;
sub.f32 f350, f346, f348;
add.f32 f1575, %121, f1577;
mul.f32 f351, f1577, 0f3F000000;
sub.f32 f352, %121, f351;
sub.f32 f353, %78, %96;
mul.f32 f354, f353, 0fBF5DB3D7;
sub.f32 f355, f352, f354;
add.f32 f356, f354, f352;
add.f32 f357, %84, %102;
add.f32 f358, %66, f357;
mul.f32 f361, f357, 0f3F000000;
sub.f32 f362, %66, f361;
add.f32 f1572, %123, %122;
sub.f32 f363, %123, %122;
mul.f32 f364, f363, 0fBF5DB3D7;
add.f32 f365, f364, f362;
sub.f32 f366, f362, f364;
add.f32 f1570, %124, f1572;
mul.f32 f367, f1572, 0f3F000000;
sub.f32 f368, %124, f367;
sub.f32 f369, %84, %102;
mul.f32 f370, f369, 0fBF5DB3D7;
sub.f32 f371, f368, f370;
add.f32 f372, f370, f368;
add.f32 f373, %90, %108;
add.f32 f374, %72, f373;
mul.f32 f377, f373, 0f3F000000;
sub.f32 f378, %72, f377;
add.f32 f1568, %125, %109;
sub.f32 f379, %125, %109;
mul.f32 f380, f379, 0fBF5DB3D7;
add.f32 f381, f380, f378;
sub.f32 f382, f378, f380;
add.f32 f1566, %126, f1568;
mul.f32 f383, f1568, 0f3F000000;
sub.f32 f384, %126, f383;
sub.f32 f385, %90, %108;
mul.f32 f386, f385, 0fBF5DB3D7;
sub.f32 f387, f384, f386;
add.f32 f388, f386, f384;
mul.f32 f390, f371, 0f3F248DBB;
mul.f32 f1565, f365, 0f3F441B7D;
sub.f32 f391, f1565, f390;
mul.f32 f392, f371, 0f3F441B7D;
fma.rn.f32 f393, f365, 0f3F248DBB, f392;
mul.f32 f395, f387, 0f3F7C1C5C;
mul.f32 f1564, f381, 0f3E31D0D4;
sub.f32 f396, f1564, f395;
mul.f32 f397, f387, 0f3E31D0D4;
fma.rn.f32 f398, f381, 0f3F7C1C5C, f397;
mul.f32 f1562, f366, 0f3E31D0D4;
mul.f32 f1563, f372, 0f3F7C1C5C;
sub.f32 f401, f1562, f1563;
mul.f32 f402, f372, 0f3E31D0D4;
fma.rn.f32 f403, f366, 0f3F7C1C5C, f402;
mul.f32 f1560, f382, 0fBF708FB2;
mul.f32 f1561, f388, 0f3EAF1D44;
sub.f32 f406, f1560, f1561;
mul.f32 f407, f388, 0fBF708FB2;
fma.rn.f32 f408, f382, 0f3EAF1D44, f407;
add.f32 f409, f358, f374;
add.f32 f410, f342, f409;
mul.f32 f413, f409, 0f3F000000;
sub.f32 f414, f342, f413;
add.f32 f1559, f1570, f1566;
sub.f32 f415, f1570, f1566;
mul.f32 f416, f415, 0fBF5DB3D7;
add.f32 f417, f416, f414;
sub.f32 f418, f414, f416;
add.f32 f1558, f1575, f1559;
mul.f32 f419, f1559, 0f3F000000;
sub.f32 f420, f1575, f419;
sub.f32 f421, f358, f374;
mul.f32 f422, f421, 0fBF5DB3D7;
sub.f32 f423, f420, f422;
add.f32 f424, f422, f420;
add.f32 f425, f391, f396;
add.f32 f426, f349, f425;
mul.f32 f429, f425, 0f3F000000;
sub.f32 f430, f349, f429;
add.f32 f1557, f393, f398;
sub.f32 f431, f393, f398;
mul.f32 f432, f431, 0fBF5DB3D7;
add.f32 f433, f432, f430;
sub.f32 f434, f430, f432;
add.f32 f1556, f355, f1557;
mul.f32 f435, f1557, 0f3F000000;
sub.f32 f436, f355, f435;
sub.f32 f437, f391, f396;
mul.f32 f438, f437, 0fBF5DB3D7;
sub.f32 f439, f436, f438;
add.f32 f440, f438, f436;
add.f32 f441, f401, f406;
add.f32 f442, f350, f441;
mul.f32 f445, f441, 0f3F000000;
sub.f32 f446, f350, f445;
add.f32 f1555, f403, f408;
sub.f32 f447, f403, f408;
mul.f32 f448, f447, 0fBF5DB3D7;
add.f32 f449, f448, f446;
sub.f32 f450, f446, f448;
add.f32 f1554, f356, f1555;
mul.f32 f451, f1555, 0f3F000000;
sub.f32 f452, f356, f451;
sub.f32 f453, f401, f406;
mul.f32 f454, f453, 0fBF5DB3D7;
sub.f32 f455, f452, f454;
add.f32 f456, f454, f452;
mul.f32 f458, f1582, 0f3E6C2691;
mul.f32 f1553, f310, 0f3F791978;
sub.f32 f459, f1553, f458;
mul.f32 f460, f1582, 0f3F791978;
fma.rn.f32 f461, f310, 0f3E6C2691, f460;
mul.f32 f1551, f426, 0f3F64C51C;
mul.f32 f1552, f1556, 0f3EE5C902;
sub.f32 f464, f1551, f1552;
mul.f32 f465, f1556, 0f3F64C51C;
fma.rn.f32 f466, f426, 0f3EE5C902, f465;
mul.f32 f1549, f326, 0f3F64C51C;
mul.f32 f1550, f1580, 0f3EE5C902;
sub.f32 f469, f1549, f1550;
mul.f32 f470, f1580, 0f3F64C51C;
fma.rn.f32 f471, f326, 0f3EE5C902, f470;
mul.f32 f1547, f442, 0f3F18DF63;
mul.f32 f1548, f1554, 0f3F4D57F2;
sub.f32 f474, f1547, f1548;
mul.f32 f475, f1554, 0f3F18DF63;
fma.rn.f32 f476, f442, 0f3F4D57F2, f475;
mul.f32 f1545, f301, 0f3F441B7D;
mul.f32 f1546, f307, 0f3F248DBB;
sub.f32 f479, f1545, f1546;
mul.f32 f480, f307, 0f3F441B7D;
fma.rn.f32 f481, f301, 0f3F248DBB, f480;
mul.f32 f483, f423, 0f3F7C1C5C;
mul.f32 f1544, f417, 0f3E31D0D4;
sub.f32 f484, f1544, f483;
mul.f32 f485, f423, 0f3E31D0D4;
fma.rn.f32 f486, f417, 0f3F7C1C5C, f485;
mul.f32 f488, f323, 0f3F4D57F2;
mul.f32 f1543, f317, 0f3F18DF63;
sub.f32 f489, f1543, f488;
mul.f32 f490, f323, 0f3F18DF63;
fma.rn.f32 f491, f317, 0f3F4D57F2, f490;
mul.f32 f493, f439, 0f3F753ECD;
mul.f32 f1542, f433, 0fBE92D7E0;
sub.f32 f494, f1542, f493;
mul.f32 f495, f439, 0fBE92D7E0;
fma.rn.f32 f496, f433, 0f3F753ECD, f495;
mul.f32 f498, f339, 0f3F6B1036;
mul.f32 f1541, f333, 0f3ECACAF8;
sub.f32 f499, f1541, f498;
mul.f32 f500, f339, 0f3ECACAF8;
fma.rn.f32 f501, f333, 0f3F6B1036, f500;
mul.f32 f503, f455, 0f3F3A3529;
mul.f32 f1540, f449, 0fBF2FAD88;
sub.f32 f504, f1540, f503;
mul.f32 f505, f455, 0fBF2FAD88;
fma.rn.f32 f506, f449, 0f3F3A3529, f505;
mul.f32 f508, f308, 0f3F7C1C5C;
mul.f32 f1539, f302, 0f3E31D0D4;
sub.f32 f509, f1539, f508;
mul.f32 f510, f308, 0f3E31D0D4;
fma.rn.f32 f511, f302, 0f3F7C1C5C, f510;
mul.f32 f1537, f418, 0fBF708FB2;
mul.f32 f1538, f424, 0f3EAF1D44;
sub.f32 f514, f1537, f1538;
mul.f32 f515, f424, 0fBF708FB2;
fma.rn.f32 f516, f418, 0f3EAF1D44, f515;
mul.f32 f1535, f318, 0fBD6E2946;
mul.f32 f1536, f324, 0f3F7F9120;
sub.f32 f519, f1535, f1536;
mul.f32 f520, f324, 0fBD6E2946;
fma.rn.f32 f521, f318, 0f3F7F9120, f520;
mul.f32 f1533, f434, 0fBF7E44DE;
mul.f32 f1534, f440, 0fBDEDC21F;
sub.f32 f524, f1533, f1534;
mul.f32 f525, f440, 0fBF7E44DE;
fma.rn.f32 f526, f434, 0fBDEDC21F, f525;
mul.f32 f528, f340, 0f3F753ECD;
mul.f32 f1532, f334, 0fBE92D7E0;
sub.f32 f529, f1532, f528;
mul.f32 f530, f340, 0fBE92D7E0;
fma.rn.f32 f531, f334, 0f3F753ECD, f530;
mul.f32 f533, f456, 0fBF0CAC9F;
mul.f32 f1531, f450, 0fBF55E287;
sub.f32 f534, f1531, f533;
mul.f32 f535, f456, 0fBF55E287;
fma.rn.f32 f536, f450, 0fBF0CAC9F, f535;
add.f32 f537, f294, f410;
mul.f32 f539, f537, 0f3F000000;
sub.f32 f540, f178, f539;
add.f32 f1530, f1584, f1558;
sub.f32 f541, f1584, f1558;
mul.f32 f542, f541, 0fBF5DB3D7;
add.f32 f543, f542, f540;
sub.f32 f544, f540, f542;
mul.f32 f545, f1530, 0f3F000000;
sub.f32 f546, f1611, f545;
sub.f32 f547, f294, f410;
mul.f32 f548, f547, 0fBF5DB3D7;
sub.f32 f549, f546, f548;
add.f32 f550, f548, f546;
add.f32 f551, f459, f464;
add.f32 f552, f194, f551;
mul.f32 f555, f551, 0f3F000000;
sub.f32 f556, f194, f555;
add.f32 f1529, f461, f466;
sub.f32 f557, f461, f466;
mul.f32 f558, f557, 0fBF5DB3D7;
add.f32 f559, f558, f556;
sub.f32 f560, f556, f558;
add.f32 f1528, f1609, f1529;
mul.f32 f561, f1529, 0f3F000000;
sub.f32 f562, f1609, f561;
sub.f32 f563, f459, f464;
mul.f32 f564, f563, 0fBF5DB3D7;
sub.f32 f565, f562, f564;
add.f32 f566, f564, f562;
add.f32 f567, f469, f474;
add.f32 f568, f210, f567;
mul.f32 f571, f567, 0f3F000000;
sub.f32 f572, f210, f571;
add.f32 f1527, f471, f476;
sub.f32 f573, f471, f476;
mul.f32 f574, f573, 0fBF5DB3D7;
add.f32 f575, f574, f572;
sub.f32 f576, f572, f574;
add.f32 f1526, f1607, f1527;
mul.f32 f577, f1527, 0f3F000000;
sub.f32 f578, f1607, f577;
sub.f32 f579, f469, f474;
mul.f32 f580, f579, 0fBF5DB3D7;
sub.f32 f581, f578, f580;
add.f32 f582, f580, f578;
add.f32 f583, f479, f484;
add.f32 f584, f185, f583;
mul.f32 f587, f583, 0f3F000000;
sub.f32 f588, f185, f587;
add.f32 f1525, f481, f486;
sub.f32 f589, f481, f486;
mul.f32 f590, f589, 0fBF5DB3D7;
add.f32 f591, f590, f588;
sub.f32 f592, f588, f590;
add.f32 f1524, f191, f1525;
mul.f32 f593, f1525, 0f3F000000;
sub.f32 f594, f191, f593;
sub.f32 f595, f479, f484;
mul.f32 f596, f595, 0fBF5DB3D7;
sub.f32 f597, f594, f596;
add.f32 f598, f596, f594;
add.f32 f599, f489, f494;
add.f32 f600, f201, f599;
mul.f32 f603, f599, 0f3F000000;
sub.f32 f604, f201, f603;
add.f32 f1523, f491, f496;
sub.f32 f605, f491, f496;
mul.f32 f606, f605, 0fBF5DB3D7;
add.f32 f607, f606, f604;
sub.f32 f608, f604, f606;
add.f32 f1522, f207, f1523;
mul.f32 f609, f1523, 0f3F000000;
sub.f32 f610, f207, f609;
sub.f32 f611, f489, f494;
mul.f32 f612, f611, 0fBF5DB3D7;
sub.f32 f613, f610, f612;
add.f32 f614, f612, f610;
add.f32 f615, f499, f504;
add.f32 f616, f217, f615;
mul.f32 f619, f615, 0f3F000000;
sub.f32 f620, f217, f619;
add.f32 f1521, f501, f506;
sub.f32 f621, f501, f506;
mul.f32 f622, f621, 0fBF5DB3D7;
add.f32 f623, f622, f620;
sub.f32 f624, f620, f622;
add.f32 f1520, f223, f1521;
mul.f32 f625, f1521, 0f3F000000;
sub.f32 f626, f223, f625;
sub.f32 f627, f499, f504;
mul.f32 f628, f627, 0fBF5DB3D7;
sub.f32 f629, f626, f628;
add.f32 f630, f628, f626;
add.f32 f631, f509, f514;
add.f32 f632, f186, f631;
mul.f32 f635, f631, 0f3F000000;
sub.f32 f636, f186, f635;
add.f32 f1519, f511, f516;
sub.f32 f637, f511, f516;
mul.f32 f638, f637, 0fBF5DB3D7;
add.f32 f639, f638, f636;
sub.f32 f640, f636, f638;
add.f32 f1518, f192, f1519;
mul.f32 f641, f1519, 0f3F000000;
sub.f32 f642, f192, f641;
sub.f32 f643, f509, f514;
mul.f32 f644, f643, 0fBF5DB3D7;
sub.f32 f645, f642, f644;
add.f32 f646, f644, f642;
add.f32 f647, f519, f524;
add.f32 f648, f202, f647;
mul.f32 f651, f647, 0f3F000000;
sub.f32 f652, f202, f651;
add.f32 f1517, f521, f526;
sub.f32 f653, f521, f526;
mul.f32 f654, f653, 0fBF5DB3D7;
add.f32 f655, f654, f652;
sub.f32 f656, f652, f654;
add.f32 f1516, f208, f1517;
mul.f32 f657, f1517, 0f3F000000;
sub.f32 f658, f208, f657;
sub.f32 f659, f519, f524;
mul.f32 f660, f659, 0fBF5DB3D7;
sub.f32 f661, f658, f660;
add.f32 f662, f660, f658;
add.f32 f663, f529, f534;
add.f32 f664, f218, f663;
mul.f32 f667, f663, 0f3F000000;
sub.f32 f668, f218, f667;
add.f32 f1515, f531, f536;
sub.f32 f669, f531, f536;
mul.f32 f670, f669, 0fBF5DB3D7;
add.f32 f671, f670, f668;
sub.f32 f672, f668, f670;
add.f32 f1514, f224, f1515;
mul.f32 f673, f1515, 0f3F000000;
sub.f32 f674, f224, f673;
sub.f32 f675, f529, f534;
mul.f32 f676, f675, 0fBF5DB3D7;
sub.f32 f677, f674, f676;
add.f32 f678, f676, f674;
mov.u32 r11, %tid.x;
mul.wide.u32 rd2, r11, 954437177;
shr.u64 rd3, rd2, 33;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 9;
sub.s32 r7, r11, r6;
mad.lo.s32 r8, r5, 1944, r3;
mul.wide.u32 rd7, r7, 8;
mov.u64 rd8, %55;
add.s64 rd6, rd8, rd7;
ld.global.v2.f32 {f679, f680}, [rd6];
mul.f32 f683, f1528, f680;
mul.f32 f685, f679, f1528;
mul.f32 f1512, f679, f679;
mul.f32 f1513, f680, f680;
sub.f32 f688, f1512, f1513;
mul.f32 f689, f680, f679;
fma.rn.f32 f690, f680, f679, f689;
mul.f32 f691, f1526, f690;
mul.f32 f693, f688, f1526;
mul.f32 f695, f680, f690;
mul.f32 f1511, f679, f688;
sub.f32 f696, f1511, f695;
mul.f32 f1510, f568, f690;
mul.f32 f697, f679, f690;
fma.rn.f32 f698, f680, f688, f697;
mul.f32 f699, f1524, f698;
mul.f32 f701, f696, f1524;
mul.f32 f1508, f679, f696;
mul.f32 f1509, f680, f698;
sub.f32 f704, f1508, f1509;
mul.f32 f1507, f584, f698;
mul.f32 f705, f679, f698;
fma.rn.f32 f706, f680, f696, f705;
mul.f32 f707, f1522, f706;
mul.f32 f709, f704, f1522;
mul.f32 f711, f680, f706;
mul.f32 f1506, f679, f704;
sub.f32 f712, f1506, f711;
mul.f32 f1505, f600, f706;
mul.f32 f713, f679, f706;
fma.rn.f32 f714, f680, f704, f713;
mul.f32 f715, f1520, f714;
mul.f32 f717, f712, f1520;
mul.f32 f719, f680, f714;
mul.f32 f1504, f679, f712;
sub.f32 f720, f1504, f719;
mul.f32 f1503, f616, f714;
mul.f32 f721, f679, f714;
fma.rn.f32 f722, f680, f712, f721;
mul.f32 f723, f1518, f722;
mul.f32 f725, f720, f1518;
mul.f32 f1501, f679, f720;
mul.f32 f1502, f680, f722;
sub.f32 f728, f1501, f1502;
mul.f32 f1500, f632, f722;
mul.f32 f729, f679, f722;
fma.rn.f32 f730, f680, f720, f729;
mul.f32 f731, f1516, f730;
mul.f32 f733, f728, f1516;
mul.f32 f735, f680, f730;
mul.f32 f1499, f679, f728;
sub.f32 f736, f1499, f735;
mul.f32 f1498, f648, f730;
mul.f32 f737, f679, f730;
fma.rn.f32 f738, f680, f728, f737;
mul.f32 f739, f1514, f738;
mul.f32 f741, f736, f1514;
mul.f32 f743, f680, f738;
mul.f32 f1497, f679, f736;
sub.f32 f744, f1497, f743;
mul.f32 f1496, f664, f738;
mul.f32 f745, f679, f738;
fma.rn.f32 f746, f680, f736, f745;
mul.f32 f747, f549, f746;
mul.f32 f749, f744, f549;
mul.f32 f1494, f679, f744;
mul.f32 f1495, f680, f746;
sub.f32 f752, f1494, f1495;
mul.f32 f1493, f543, f746;
mul.f32 f753, f679, f746;
fma.rn.f32 f754, f680, f744, f753;
mul.f32 f755, f565, f754;
mul.f32 f757, f752, f565;
mul.f32 f759, f680, f754;
mul.f32 f1492, f679, f752;
sub.f32 f760, f1492, f759;
mul.f32 f1491, f559, f754;
mul.f32 f761, f679, f754;
fma.rn.f32 f762, f680, f752, f761;
mul.f32 f763, f581, f762;
mul.f32 f765, f760, f581;
mul.f32 f1489, f679, f760;
mul.f32 f1490, f680, f762;
sub.f32 f768, f1489, f1490;
mul.f32 f1488, f575, f762;
mul.f32 f769, f679, f762;
fma.rn.f32 f770, f680, f760, f769;
mul.f32 f771, f597, f770;
mul.f32 f773, f768, f597;
mul.f32 f775, f680, f770;
mul.f32 f1487, f679, f768;
sub.f32 f776, f1487, f775;
mul.f32 f1486, f591, f770;
mul.f32 f777, f679, f770;
fma.rn.f32 f778, f680, f768, f777;
mul.f32 f779, f613, f778;
mul.f32 f781, f776, f613;
mul.f32 f783, f680, f778;
mul.f32 f1485, f679, f776;
sub.f32 f784, f1485, f783;
mul.f32 f1484, f607, f778;
mul.f32 f785, f679, f778;
fma.rn.f32 f786, f680, f776, f785;
mul.f32 f787, f629, f786;
mul.f32 f789, f784, f629;
mul.f32 f1482, f679, f784;
mul.f32 f1483, f680, f786;
sub.f32 f792, f1482, f1483;
mul.f32 f1481, f623, f786;
mul.f32 f793, f679, f786;
fma.rn.f32 f794, f680, f784, f793;
mul.f32 f795, f645, f794;
mul.f32 f797, f792, f645;
mul.f32 f799, f680, f794;
mul.f32 f1480, f679, f792;
sub.f32 f800, f1480, f799;
mul.f32 f1479, f639, f794;
mul.f32 f801, f679, f794;
fma.rn.f32 f802, f680, f792, f801;
mul.f32 f803, f661, f802;
mul.f32 f805, f800, f661;
mul.f32 f807, f680, f802;
mul.f32 f1478, f679, f800;
sub.f32 f808, f1478, f807;
mul.f32 f1477, f655, f802;
mul.f32 f809, f679, f802;
fma.rn.f32 f810, f680, f800, f809;
mul.f32 f811, f677, f810;
mul.f32 f813, f808, f677;
mul.f32 f1475, f679, f808;
mul.f32 f1476, f680, f810;
sub.f32 f816, f1475, f1476;
mul.f32 f1474, f671, f810;
mul.f32 f817, f679, f810;
fma.rn.f32 f818, f680, f808, f817;
mul.f32 f819, f550, f818;
mul.f32 f821, f816, f550;
mul.f32 f823, f680, f818;
mul.f32 f1473, f679, f816;
sub.f32 f824, f1473, f823;
mul.f32 f1472, f544, f818;
mul.f32 f825, f679, f818;
fma.rn.f32 f826, f680, f816, f825;
mul.f32 f827, f566, f826;
mul.f32 f829, f824, f566;
mul.f32 f1470, f679, f824;
mul.f32 f1471, f680, f826;
sub.f32 f832, f1470, f1471;
mul.f32 f1469, f560, f826;
mul.f32 f833, f679, f826;
fma.rn.f32 f834, f680, f824, f833;
mul.f32 f835, f582, f834;
mul.f32 f837, f832, f582;
mul.f32 f839, f680, f834;
mul.f32 f1468, f679, f832;
sub.f32 f840, f1468, f839;
mul.f32 f1467, f576, f834;
mul.f32 f841, f679, f834;
fma.rn.f32 f842, f680, f832, f841;
mul.f32 f843, f598, f842;
mul.f32 f845, f840, f598;
mul.f32 f847, f680, f842;
mul.f32 f1466, f679, f840;
sub.f32 f848, f1466, f847;
mul.f32 f1465, f592, f842;
mul.f32 f849, f679, f842;
fma.rn.f32 f850, f680, f840, f849;
mul.f32 f851, f614, f850;
mul.f32 f853, f848, f614;
mul.f32 f1463, f679, f848;
mul.f32 f1464, f680, f850;
sub.f32 f856, f1463, f1464;
mul.f32 f1462, f608, f850;
mul.f32 f857, f679, f850;
fma.rn.f32 f858, f680, f848, f857;
mul.f32 f859, f630, f858;
mul.f32 f861, f856, f630;
mul.f32 f863, f680, f858;
mul.f32 f1461, f679, f856;
sub.f32 f864, f1461, f863;
mul.f32 f1460, f624, f858;
mul.f32 f865, f679, f858;
fma.rn.f32 f866, f680, f856, f865;
mul.f32 f867, f646, f866;
mul.f32 f869, f864, f646;
mul.f32 f871, f680, f866;
mul.f32 f1459, f679, f864;
sub.f32 f872, f1459, f871;
mul.f32 f1458, f640, f866;
mul.f32 f873, f679, f866;
fma.rn.f32 f874, f680, f864, f873;
mul.f32 f875, f662, f874;
mul.f32 f877, f872, f662;
mul.f32 f1456, f679, f872;
mul.f32 f1457, f680, f874;
sub.f32 f880, f1456, f1457;
mul.f32 f1455, f656, f874;
mul.f32 f881, f679, f874;
mul.f32 f1454, f552, f680;
fma.rn.f32 f882, f680, f872, f881;
mul.f32 f883, f678, f882;
mul.f32 f884, f672, f882;
mul.f32 f885, f880, f678;
barrier.sync 0;
mad.lo.s32 r9, r7, 216, r8;
add.f32 f886, f1611, f1530;
add.f32 f887, f178, f537;
st.shared.v2.f32 [r9], {f887, f886};
fma.rn.f32 f888, f679, f552, f683;
sub.f32 f889, f685, f1454;
st.shared.v2.f32 [r9+8], {f888, f889};
fma.rn.f32 f890, f688, f568, f691;
sub.f32 f891, f693, f1510;
st.shared.v2.f32 [r9+16], {f890, f891};
fma.rn.f32 f892, f696, f584, f699;
sub.f32 f893, f701, f1507;
st.shared.v2.f32 [r9+24], {f892, f893};
fma.rn.f32 f894, f704, f600, f707;
sub.f32 f895, f709, f1505;
st.shared.v2.f32 [r9+32], {f894, f895};
fma.rn.f32 f896, f712, f616, f715;
sub.f32 f897, f717, f1503;
st.shared.v2.f32 [r9+40], {f896, f897};
fma.rn.f32 f898, f720, f632, f723;
sub.f32 f899, f725, f1500;
st.shared.v2.f32 [r9+48], {f898, f899};
sub.f32 f900, f733, f1498;
fma.rn.f32 f901, f728, f648, f731;
st.shared.v2.f32 [r9+56], {f901, f900};
fma.rn.f32 f902, f736, f664, f739;
sub.f32 f903, f741, f1496;
st.shared.v2.f32 [r9+64], {f902, f903};
fma.rn.f32 f904, f744, f543, f747;
sub.f32 f905, f749, f1493;
st.shared.v2.f32 [r9+72], {f904, f905};
fma.rn.f32 f906, f752, f559, f755;
sub.f32 f907, f757, f1491;
st.shared.v2.f32 [r9+80], {f906, f907};
fma.rn.f32 f908, f760, f575, f763;
sub.f32 f909, f765, f1488;
st.shared.v2.f32 [r9+88], {f908, f909};
fma.rn.f32 f910, f768, f591, f771;
sub.f32 f911, f773, f1486;
st.shared.v2.f32 [r9+96], {f910, f911};
fma.rn.f32 f912, f776, f607, f779;
sub.f32 f913, f781, f1484;
st.shared.v2.f32 [r9+104], {f912, f913};
fma.rn.f32 f914, f784, f623, f787;
sub.f32 f915, f789, f1481;
st.shared.v2.f32 [r9+112], {f914, f915};
fma.rn.f32 f916, f792, f639, f795;
sub.f32 f917, f797, f1479;
st.shared.v2.f32 [r9+120], {f916, f917};
fma.rn.f32 f918, f800, f655, f803;
sub.f32 f919, f805, f1477;
st.shared.v2.f32 [r9+128], {f918, f919};
fma.rn.f32 f920, f808, f671, f811;
sub.f32 f921, f813, f1474;
st.shared.v2.f32 [r9+136], {f920, f921};
fma.rn.f32 f922, f816, f544, f819;
sub.f32 f923, f821, f1472;
st.shared.v2.f32 [r9+144], {f922, f923};
fma.rn.f32 f924, f824, f560, f827;
sub.f32 f925, f829, f1469;
st.shared.v2.f32 [r9+152], {f924, f925};
fma.rn.f32 f926, f832, f576, f835;
sub.f32 f927, f837, f1467;
st.shared.v2.f32 [r9+160], {f926, f927};
fma.rn.f32 f928, f840, f592, f843;
sub.f32 f929, f845, f1465;
st.shared.v2.f32 [r9+168], {f928, f929};
fma.rn.f32 f930, f848, f608, f851;
sub.f32 f931, f853, f1462;
st.shared.v2.f32 [r9+176], {f930, f931};
fma.rn.f32 f932, f856, f624, f859;
sub.f32 f933, f861, f1460;
st.shared.v2.f32 [r9+184], {f932, f933};
fma.rn.f32 f934, f864, f640, f867;
sub.f32 f935, f869, f1458;
st.shared.v2.f32 [r9+192], {f934, f935};
fma.rn.f32 f936, f872, f656, f875;
sub.f32 f937, f877, f1455;
st.shared.v2.f32 [r9+200], {f936, f937};
fma.rn.f32 f938, f880, f672, f883;
sub.f32 f939, f885, f884;
st.shared.v2.f32 [r9+208], {f938, f939};
barrier.sync 0;
mad.lo.s32 r10, r7, -208, r9;
ld.shared.v2.f32 {f940, f941}, [r10];
ld.shared.v2.f32 {f944, f945}, [r10+72];
ld.shared.v2.f32 {f948, f949}, [r10+144];
ld.shared.v2.f32 {f952, f953}, [r10+216];
ld.shared.v2.f32 {f956, f957}, [r10+288];
ld.shared.v2.f32 {f960, f961}, [r10+360];
ld.shared.v2.f32 {f964, f965}, [r10+432];
ld.shared.v2.f32 {f968, f969}, [r10+504];
ld.shared.v2.f32 {f972, f973}, [r10+576];
ld.shared.v2.f32 {f976, f977}, [r10+648];
ld.shared.v2.f32 {f980, f981}, [r10+720];
ld.shared.v2.f32 {f984, f985}, [r10+792];
ld.shared.v2.f32 {f988, f989}, [r10+864];
ld.shared.v2.f32 {f992, f993}, [r10+936];
ld.shared.v2.f32 {f996, f997}, [r10+1008];
ld.shared.v2.f32 {f1000, f1001}, [r10+1080];
ld.shared.v2.f32 {f1004, f1005}, [r10+1152];
ld.shared.v2.f32 {f1008, f1009}, [r10+1224];
ld.shared.v2.f32 {f1012, f1013}, [r10+1296];
ld.shared.v2.f32 {f1016, f1017}, [r10+1368];
ld.shared.v2.f32 {f1020, f1021}, [r10+1440];
ld.shared.v2.f32 {f1024, f1025}, [r10+1512];
ld.shared.v2.f32 {f1028, f1029}, [r10+1584];
ld.shared.v2.f32 {f1032, f1033}, [r10+1656];
ld.shared.v2.f32 {f1036, f1037}, [r10+1728];
ld.shared.v2.f32 {f1040, f1041}, [r10+1800];
ld.shared.v2.f32 {f1044, f1045}, [r10+1872];
add.f32 f1048, f976, f1012;
add.f32 f1049, f940, f1048;
mul.f32 f1052, f1048, 0f3F000000;
sub.f32 f1053, f940, f1052;
add.f32 f1453, f977, f1013;
sub.f32 f1054, f977, f1013;
mul.f32 f1055, f1054, 0fBF5DB3D7;
add.f32 f1056, f1055, f1053;
sub.f32 f1057, f1053, f1055;
add.f32 f1452, f941, f1453;
mul.f32 f1058, f1453, 0f3F000000;
sub.f32 f1059, f941, f1058;
sub.f32 f1060, f976, f1012;
mul.f32 f1061, f1060, 0fBF5DB3D7;
sub.f32 f1062, f1059, f1061;
add.f32 f1063, f1061, f1059;
add.f32 f1064, f988, f1024;
add.f32 f1065, f952, f1064;
mul.f32 f1068, f1064, 0f3F000000;
sub.f32 f1069, f952, f1068;
add.f32 f1451, f989, f1025;
sub.f32 f1070, f989, f1025;
mul.f32 f1071, f1070, 0fBF5DB3D7;
add.f32 f1072, f1071, f1069;
sub.f32 f1073, f1069, f1071;
add.f32 f1450, f953, f1451;
mul.f32 f1074, f1451, 0f3F000000;
sub.f32 f1075, f953, f1074;
sub.f32 f1076, f988, f1024;
mul.f32 f1077, f1076, 0fBF5DB3D7;
sub.f32 f1078, f1075, f1077;
add.f32 f1079, f1077, f1075;
add.f32 f1080, f1000, f1036;
add.f32 f1081, f964, f1080;
mul.f32 f1084, f1080, 0f3F000000;
sub.f32 f1085, f964, f1084;
add.f32 f1449, f1001, f1037;
sub.f32 f1086, f1001, f1037;
mul.f32 f1087, f1086, 0fBF5DB3D7;
add.f32 f1088, f1087, f1085;
sub.f32 f1089, f1085, f1087;
add.f32 f1448, f965, f1449;
mul.f32 f1090, f1449, 0f3F000000;
sub.f32 f1091, f965, f1090;
sub.f32 f1092, f1000, f1036;
mul.f32 f1093, f1092, 0fBF5DB3D7;
sub.f32 f1094, f1091, f1093;
add.f32 f1095, f1093, f1091;
mul.f32 f1446, f1072, 0f3F441B7D;
mul.f32 f1447, f1078, 0f3F248DBB;
sub.f32 f1098, f1446, f1447;
mul.f32 f1099, f1078, 0f3F441B7D;
fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099;
mul.f32 f1444, f1088, 0f3E31D0D4;
mul.f32 f1445, f1094, 0f3F7C1C5C;
sub.f32 f1103, f1444, f1445;
mul.f32 f1104, f1094, 0f3E31D0D4;
fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104;
mul.f32 f1442, f1073, 0f3E31D0D4;
mul.f32 f1443, f1079, 0f3F7C1C5C;
sub.f32 f1108, f1442, f1443;
mul.f32 f1109, f1079, 0f3E31D0D4;
fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109;
mul.f32 f1112, f1095, 0f3EAF1D44;
mul.f32 f1441, f1089, 0fBF708FB2;
sub.f32 f1113, f1441, f1112;
mul.f32 f1114, f1095, 0fBF708FB2;
fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114;
add.f32 f1116, f1065, f1081;
mul.f32 f1118, f1116, 0f3F000000;
sub.f32 f1119, f1049, f1118;
add.f32 f1440, f1450, f1448;
sub.f32 f1120, f1450, f1448;
mul.f32 f1121, f1120, 0fBF5DB3D7;
mul.f32 f1122, f1440, 0f3F000000;
sub.f32 f1123, f1452, f1122;
sub.f32 f1124, f1065, f1081;
mul.f32 f1125, f1124, 0fBF5DB3D7;
add.f32 f1126, f1098, f1103;
mul.f32 f1128, f1126, 0f3F000000;
sub.f32 f1129, f1056, f1128;
add.f32 f1439, f1100, f1105;
sub.f32 f1130, f1100, f1105;
mul.f32 f1131, f1130, 0fBF5DB3D7;
mul.f32 f1132, f1439, 0f3F000000;
sub.f32 f1133, f1062, f1132;
sub.f32 f1134, f1098, f1103;
mul.f32 f1135, f1134, 0fBF5DB3D7;
add.f32 f1136, f1108, f1113;
mul.f32 f1138, f1136, 0f3F000000;
sub.f32 f1139, f1057, f1138;
add.f32 f1438, f1110, f1115;
sub.f32 f1140, f1110, f1115;
mul.f32 f1141, f1140, 0fBF5DB3D7;
mul.f32 f1142, f1438, 0f3F000000;
sub.f32 f1143, f1063, f1142;
sub.f32 f1144, f1108, f1113;
mul.f32 f1145, f1144, 0fBF5DB3D7;
add.f32 f1146, f980, f1016;
add.f32 f1147, f944, f1146;
mul.f32 f1150, f1146, 0f3F000000;
sub.f32 f1151, f944, f1150;
add.f32 f1437, f981, f1017;
sub.f32 f1152, f981, f1017;
mul.f32 f1153, f1152, 0fBF5DB3D7;
add.f32 f1154, f1153, f1151;
sub.f32 f1155, f1151, f1153;
add.f32 f1436, f945, f1437;
mul.f32 f1156, f1437, 0f3F000000;
sub.f32 f1157, f945, f1156;
sub.f32 f1158, f980, f1016;
mul.f32 f1159, f1158, 0fBF5DB3D7;
sub.f32 f1160, f1157, f1159;
add.f32 f1161, f1159, f1157;
add.f32 f1162, f992, f1028;
add.f32 f1163, f956, f1162;
mul.f32 f1166, f1162, 0f3F000000;
sub.f32 f1167, f956, f1166;
add.f32 f1435, f993, f1029;
sub.f32 f1168, f993, f1029;
mul.f32 f1169, f1168, 0fBF5DB3D7;
add.f32 f1170, f1169, f1167;
sub.f32 f1171, f1167, f1169;
add.f32 f1434, f957, f1435;
mul.f32 f1172, f1435, 0f3F000000;
sub.f32 f1173, f957, f1172;
sub.f32 f1174, f992, f1028;
mul.f32 f1175, f1174, 0fBF5DB3D7;
sub.f32 f1176, f1173, f1175;
add.f32 f1177, f1175, f1173;
add.f32 f1178, f1004, f1040;
add.f32 f1179, f968, f1178;
mul.f32 f1182, f1178, 0f3F000000;
sub.f32 f1183, f968, f1182;
add.f32 f1433, f1005, f1041;
sub.f32 f1184, f1005, f1041;
mul.f32 f1185, f1184, 0fBF5DB3D7;
add.f32 f1186, f1185, f1183;
sub.f32 f1187, f1183, f1185;
add.f32 f1432, f969, f1433;
mul.f32 f1188, f1433, 0f3F000000;
sub.f32 f1189, f969, f1188;
sub.f32 f1190, f1004, f1040;
mul.f32 f1191, f1190, 0fBF5DB3D7;
sub.f32 f1192, f1189, f1191;
add.f32 f1193, f1191, f1189;
mul.f32 f1195, f1176, 0f3F248DBB;
mul.f32 f1431, f1170, 0f3F441B7D;
sub.f32 f1196, f1431, f1195;
mul.f32 f1197, f1176, 0f3F441B7D;
fma.rn.f32 f1198, f1170, 0f3F248DBB, f1197;
mul.f32 f1200, f1192, 0f3F7C1C5C;
mul.f32 f1430, f1186, 0f3E31D0D4;
sub.f32 f1201, f1430, f1200;
mul.f32 f1202, f1192, 0f3E31D0D4;
fma.rn.f32 f1203, f1186, 0f3F7C1C5C, f1202;
mul.f32 f1205, f1177, 0f3F7C1C5C;
mul.f32 f1429, f1171, 0f3E31D0D4;
sub.f32 f1206, f1429, f1205;
mul.f32 f1207, f1177, 0f3E31D0D4;
fma.rn.f32 f1208, f1171, 0f3F7C1C5C, f1207;
mul.f32 f1210, f1193, 0f3EAF1D44;
mul.f32 f1428, f1187, 0fBF708FB2;
sub.f32 f1211, f1428, f1210;
mul.f32 f1212, f1193, 0fBF708FB2;
fma.rn.f32 f1213, f1187, 0f3EAF1D44, f1212;
add.f32 f1214, f1163, f1179;
mul.f32 f1216, f1214, 0f3F000000;
sub.f32 f1217, f1147, f1216;
add.f32 f1427, f1434, f1432;
sub.f32 f1218, f1434, f1432;
mul.f32 f1219, f1218, 0fBF5DB3D7;
mul.f32 f1220, f1427, 0f3F000000;
sub.f32 f1221, f1436, f1220;
sub.f32 f1222, f1163, f1179;
mul.f32 f1223, f1222, 0fBF5DB3D7;
add.f32 f1224, f1196, f1201;
mul.f32 f1226, f1224, 0f3F000000;
sub.f32 f1227, f1154, f1226;
add.f32 f1426, f1198, f1203;
sub.f32 f1228, f1198, f1203;
mul.f32 f1229, f1228, 0fBF5DB3D7;
mul.f32 f1230, f1426, 0f3F000000;
sub.f32 f1231, f1160, f1230;
sub.f32 f1232, f1196, f1201;
mul.f32 f1233, f1232, 0fBF5DB3D7;
add.f32 f1234, f1206, f1211;
mul.f32 f1236, f1234, 0f3F000000;
sub.f32 f1237, f1155, f1236;
add.f32 f1425, f1208, f1213;
sub.f32 f1238, f1208, f1213;
mul.f32 f1239, f1238, 0fBF5DB3D7;
mul.f32 f1240, f1425, 0f3F000000;
sub.f32 f1241, f1161, f1240;
sub.f32 f1242, f1206, f1211;
mul.f32 f1243, f1242, 0fBF5DB3D7;
add.f32 f1244, f984, f1020;
add.f32 f1245, f948, f1244;
mul.f32 f1248, f1244, 0f3F000000;
sub.f32 f1249, f948, f1248;
add.f32 f1424, f985, f1021;
sub.f32 f1250, f985, f1021;
mul.f32 f1251, f1250, 0fBF5DB3D7;
add.f32 f1252, f1251, f1249;
sub.f32 f1253, f1249, f1251;
add.f32 f1423, f949, f1424;
mul.f32 f1254, f1424, 0f3F000000;
sub.f32 f1255, f949, f1254;
sub.f32 f1256, f984, f1020;
mul.f32 f1257, f1256, 0fBF5DB3D7;
sub.f32 f1258, f1255, f1257;
add.f32 f1259, f1257, f1255;
add.f32 f1260, f996, f1032;
add.f32 f1261, f960, f1260;
mul.f32 f1264, f1260, 0f3F000000;
sub.f32 f1265, f960, f1264;
add.f32 f1422, f997, f1033;
sub.f32 f1266, f997, f1033;
mul.f32 f1267, f1266, 0fBF5DB3D7;
add.f32 f1268, f1267, f1265;
sub.f32 f1269, f1265, f1267;
add.f32 f1421, f961, f1422;
mul.f32 f1270, f1422, 0f3F000000;
sub.f32 f1271, f961, f1270;
sub.f32 f1272, f996, f1032;
mul.f32 f1273, f1272, 0fBF5DB3D7;
sub.f32 f1274, f1271, f1273;
add.f32 f1275, f1273, f1271;
add.f32 f1276, f1008, f1044;
add.f32 f1277, f972, f1276;
mul.f32 f1280, f1276, 0f3F000000;
sub.f32 f1281, f972, f1280;
add.f32 f1420, f1009, f1045;
sub.f32 f1282, f1009, f1045;
mul.f32 f1283, f1282, 0fBF5DB3D7;
add.f32 f1284, f1283, f1281;
sub.f32 f1285, f1281, f1283;
add.f32 f1419, f973, f1420;
mul.f32 f1286, f1420, 0f3F000000;
sub.f32 f1287, f973, f1286;
sub.f32 f1288, f1008, f1044;
mul.f32 f1289, f1288, 0fBF5DB3D7;
sub.f32 f1290, f1287, f1289;
add.f32 f1291, f1289, f1287;
mul.f32 f1293, f1274, 0f3F248DBB;
mul.f32 f1418, f1268, 0f3F441B7D;
sub.f32 f1294, f1418, f1293;
mul.f32 f1295, f1274, 0f3F441B7D;
fma.rn.f32 f1296, f1268, 0f3F248DBB, f1295;
mul.f32 f1298, f1290, 0f3F7C1C5C;
mul.f32 f1417, f1284, 0f3E31D0D4;
sub.f32 f1299, f1417, f1298;
mul.f32 f1300, f1290, 0f3E31D0D4;
fma.rn.f32 f1301, f1284, 0f3F7C1C5C, f1300;
mul.f32 f1415, f1269, 0f3E31D0D4;
mul.f32 f1416, f1275, 0f3F7C1C5C;
sub.f32 f1304, f1415, f1416;
mul.f32 f1305, f1275, 0f3E31D0D4;
fma.rn.f32 f1306, f1269, 0f3F7C1C5C, f1305;
mul.f32 f1413, f1285, 0fBF708FB2;
mul.f32 f1414, f1291, 0f3EAF1D44;
sub.f32 f1309, f1413, f1414;
mul.f32 f1310, f1291, 0fBF708FB2;
fma.rn.f32 f1311, f1285, 0f3EAF1D44, f1310;
add.f32 f1312, f1261, f1277;
mul.f32 f1314, f1312, 0f3F000000;
sub.f32 f1315, f1245, f1314;
add.f32 f1412, f1421, f1419;
sub.f32 f1316, f1421, f1419;
mul.f32 f1317, f1316, 0fBF5DB3D7;
mul.f32 f1318, f1412, 0f3F000000;
sub.f32 f1319, f1423, f1318;
sub.f32 f1320, f1261, f1277;
mul.f32 f1321, f1320, 0fBF5DB3D7;
add.f32 f1322, f1294, f1299;
mul.f32 f1324, f1322, 0f3F000000;
sub.f32 f1325, f1252, f1324;
add.f32 f1411, f1296, f1301;
sub.f32 f1326, f1296, f1301;
mul.f32 f1327, f1326, 0fBF5DB3D7;
mul.f32 f1328, f1411, 0f3F000000;
sub.f32 f1329, f1258, f1328;
sub.f32 f1330, f1294, f1299;
mul.f32 f1331, f1330, 0fBF5DB3D7;
add.f32 f1332, f1304, f1309;
mul.f32 f1334, f1332, 0f3F000000;
sub.f32 f1335, f1253, f1334;
add.f32 f1410, f1306, f1311;
sub.f32 f1336, f1306, f1311;
mul.f32 f1337, f1336, 0fBF5DB3D7;
mul.f32 f1338, f1410, 0f3F000000;
sub.f32 f1339, f1259, f1338;
sub.f32 f1340, f1304, f1309;
mul.f32 f1341, f1340, 0fBF5DB3D7;
add.f32 %1, f1452, f1440;
add.f32 %0, f1049, f1116;
add.f32 %3, f1436, f1427;
add.f32 %2, f1147, f1214;
add.f32 %5, f1423, f1412;
add.f32 %4, f1245, f1312;
add.f32 %7, f1062, f1439;
add.f32 %6, f1056, f1126;
add.f32 %9, f1160, f1426;
add.f32 %8, f1154, f1224;
add.f32 %11, f1258, f1411;
add.f32 %10, f1252, f1322;
add.f32 %13, f1063, f1438;
add.f32 %12, f1057, f1136;
add.f32 %15, f1161, f1425;
add.f32 %14, f1155, f1234;
add.f32 %17, f1259, f1410;
add.f32 %16, f1253, f1332;
add.f32 %18, f1121, f1119;
sub.f32 %19, f1123, f1125;
sub.f32 %21, f1221, f1223;
add.f32 %20, f1219, f1217;
sub.f32 %23, f1319, f1321;
add.f32 %22, f1317, f1315;
sub.f32 %25, f1133, f1135;
add.f32 %24, f1131, f1129;
add.f32 %26, f1229, f1227;
sub.f32 %27, f1231, f1233;
add.f32 %28, f1327, f1325;
sub.f32 %29, f1329, f1331;
add.f32 %30, f1141, f1139;
sub.f32 %31, f1143, f1145;
sub.f32 %33, f1241, f1243;
add.f32 %32, f1239, f1237;
sub.f32 %35, f1339, f1341;
add.f32 %34, f1337, f1335;
add.f32 %37, f1125, f1123;
sub.f32 %36, f1119, f1121;
add.f32 %39, f1223, f1221;
sub.f32 %38, f1217, f1219;
add.f32 %41, f1321, f1319;
sub.f32 %40, f1315, f1317;
add.f32 %43, f1135, f1133;
sub.f32 %42, f1129, f1131;
add.f32 %45, f1233, f1231;
sub.f32 %44, f1227, f1229;
add.f32 %47, f1331, f1329;
sub.f32 %46, f1325, f1327;
add.f32 %49, f1145, f1143;
sub.f32 %48, f1139, f1141;
add.f32 %51, f1243, f1241;
sub.f32 %50, f1237, f1239;
add.f32 %53, f1341, f1339;
sub.f32 %52, f1335, f1337;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), 
"f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<335, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1555>;
.reg .b32 r<14>;
.reg .b64 rd<8>;
mov.u32 r12, %tid.y;
mov.u32 r13, %54;
mad.lo.s32 r3, r12, 972, r13;
add.f32 f109, %74, %92;
add.f32 f110, %56, f109;
mul.f32 f113, f109, 0f3F000000;
sub.f32 f114, %56, f113;
add.f32 f1546, %75, %93;
sub.f32 f115, %75, %93;
mul.f32 f116, f115, 0fBF5DB3D7;
add.f32 f117, f116, f114;
sub.f32 f118, f114, f116;
add.f32 f1545, %57, f1546;
mul.f32 f119, f1546, 0f3F000000;
sub.f32 f120, %57, f119;
sub.f32 f121, %74, %92;
mul.f32 f122, f121, 0fBF5DB3D7;
sub.f32 f123, f120, f122;
add.f32 f124, f122, f120;
add.f32 f125, %80, %98;
add.f32 f126, %62, f125;
mul.f32 f129, f125, 0f3F000000;
sub.f32 f130, %62, f129;
add.f32 f1544, %81, %99;
sub.f32 f131, %81, %99;
mul.f32 f132, f131, 0fBF5DB3D7;
add.f32 f133, f132, f130;
sub.f32 f134, f130, f132;
add.f32 f1543, %63, f1544;
mul.f32 f135, f1544, 0f3F000000;
sub.f32 f136, %63, f135;
sub.f32 f137, %80, %98;
mul.f32 f138, f137, 0fBF5DB3D7;
sub.f32 f139, f136, f138;
add.f32 f140, f138, f136;
add.f32 f141, %86, %104;
add.f32 f142, %68, f141;
mul.f32 f145, f141, 0f3F000000;
sub.f32 f146, %68, f145;
add.f32 f1542, %87, %105;
sub.f32 f147, %87, %105;
mul.f32 f148, f147, 0fBF5DB3D7;
add.f32 f149, f148, f146;
sub.f32 f150, f146, f148;
add.f32 f1541, %69, f1542;
mul.f32 f151, f1542, 0f3F000000;
sub.f32 f152, %69, f151;
sub.f32 f153, %86, %104;
mul.f32 f154, f153, 0fBF5DB3D7;
sub.f32 f155, f152, f154;
add.f32 f156, f154, f152;
mul.f32 f158, f139, 0f3F248DBB;
mul.f32 f1540, f133, 0f3F441B7D;
sub.f32 f159, f1540, f158;
mul.f32 f160, f139, 0f3F441B7D;
fma.rn.f32 f161, f133, 0f3F248DBB, f160;
mul.f32 f1538, f149, 0f3E31D0D4;
mul.f32 f1539, f155, 0f3F7C1C5C;
sub.f32 f164, f1538, f1539;
mul.f32 f165, f155, 0f3E31D0D4;
fma.rn.f32 f166, f149, 0f3F7C1C5C, f165;
mul.f32 f1536, f134, 0f3E31D0D4;
mul.f32 f1537, f140, 0f3F7C1C5C;
sub.f32 f169, f1536, f1537;
mul.f32 f170, f140, 0f3E31D0D4;
fma.rn.f32 f171, f134, 0f3F7C1C5C, f170;
mul.f32 f1534, f150, 0fBF708FB2;
mul.f32 f1535, f156, 0f3EAF1D44;
sub.f32 f174, f1534, f1535;
mul.f32 f175, f156, 0fBF708FB2;
fma.rn.f32 f176, f150, 0f3EAF1D44, f175;
add.f32 f177, f126, f142;
add.f32 f178, f110, f177;
mul.f32 f181, f177, 0f3F000000;
sub.f32 f182, f110, f181;
add.f32 f1533, f1543, f1541;
sub.f32 f183, f1543, f1541;
mul.f32 f184, f183, 0fBF5DB3D7;
add.f32 f185, f184, f182;
sub.f32 f186, f182, f184;
add.f32 f1532, f1545, f1533;
mul.f32 f187, f1533, 0f3F000000;
sub.f32 f188, f1545, f187;
sub.f32 f189, f126, f142;
mul.f32 f190, f189, 0fBF5DB3D7;
sub.f32 f191, f188, f190;
add.f32 f192, f190, f188;
add.f32 f193, f159, f164;
add.f32 f194, f117, f193;
mul.f32 f197, f193, 0f3F000000;
sub.f32 f198, f117, f197;
add.f32 f1531, f161, f166;
sub.f32 f199, f161, f166;
mul.f32 f200, f199, 0fBF5DB3D7;
add.f32 f201, f200, f198;
sub.f32 f202, f198, f200;
add.f32 f1530, f123, f1531;
mul.f32 f203, f1531, 0f3F000000;
sub.f32 f204, f123, f203;
sub.f32 f205, f159, f164;
mul.f32 f206, f205, 0fBF5DB3D7;
sub.f32 f207, f204, f206;
add.f32 f208, f206, f204;
add.f32 f209, f169, f174;
add.f32 f210, f118, f209;
mul.f32 f213, f209, 0f3F000000;
sub.f32 f214, f118, f213;
add.f32 f1529, f171, f176;
sub.f32 f215, f171, f176;
mul.f32 f216, f215, 0fBF5DB3D7;
add.f32 f217, f216, f214;
sub.f32 f218, f214, f216;
add.f32 f1528, f124, f1529;
mul.f32 f219, f1529, 0f3F000000;
sub.f32 f220, f124, f219;
sub.f32 f221, f169, f174;
mul.f32 f222, f221, 0fBF5DB3D7;
sub.f32 f223, f220, f222;
add.f32 f224, f222, f220;
add.f32 f225, %76, %94;
add.f32 f226, %58, f225;
mul.f32 f229, f225, 0f3F000000;
sub.f32 f230, %58, f229;
add.f32 f1525, %111, %110;
sub.f32 f231, %111, %110;
mul.f32 f232, f231, 0fBF5DB3D7;
add.f32 f233, f232, f230;
sub.f32 f234, f230, f232;
add.f32 f1523, %112, f1525;
mul.f32 f235, f1525, 0f3F000000;
sub.f32 f236, %112, f235;
sub.f32 f237, %76, %94;
mul.f32 f238, f237, 0fBF5DB3D7;
sub.f32 f239, f236, f238;
add.f32 f240, f238, f236;
add.f32 f241, %82, %100;
add.f32 f242, %64, f241;
mul.f32 f245, f241, 0f3F000000;
sub.f32 f246, %64, f245;
add.f32 f1520, %114, %113;
sub.f32 f247, %114, %113;
mul.f32 f248, f247, 0fBF5DB3D7;
add.f32 f249, f248, f246;
sub.f32 f250, f246, f248;
add.f32 f1518, %115, f1520;
mul.f32 f251, f1520, 0f3F000000;
sub.f32 f252, %115, f251;
sub.f32 f253, %82, %100;
mul.f32 f254, f253, 0fBF5DB3D7;
sub.f32 f255, f252, f254;
add.f32 f256, f254, f252;
add.f32 f257, %88, %106;
add.f32 f258, %70, f257;
mul.f32 f261, f257, 0f3F000000;
sub.f32 f262, %70, f261;
add.f32 f1515, %116, %117;
sub.f32 f263, %116, %117;
mul.f32 f264, f263, 0fBF5DB3D7;
add.f32 f265, f264, f262;
sub.f32 f266, f262, f264;
add.f32 f1513, %118, f1515;
mul.f32 f267, f1515, 0f3F000000;
sub.f32 f268, %118, f267;
sub.f32 f269, %88, %106;
mul.f32 f270, f269, 0fBF5DB3D7;
sub.f32 f271, f268, f270;
add.f32 f272, f270, f268;
mul.f32 f274, f255, 0f3F248DBB;
mul.f32 f1512, f249, 0f3F441B7D;
sub.f32 f275, f1512, f274;
mul.f32 f276, f255, 0f3F441B7D;
fma.rn.f32 f277, f249, 0f3F248DBB, f276;
mul.f32 f279, f271, 0f3F7C1C5C;
mul.f32 f1511, f265, 0f3E31D0D4;
sub.f32 f280, f1511, f279;
mul.f32 f281, f271, 0f3E31D0D4;
fma.rn.f32 f282, f265, 0f3F7C1C5C, f281;
mul.f32 f1509, f250, 0f3E31D0D4;
mul.f32 f1510, f256, 0f3F7C1C5C;
sub.f32 f285, f1509, f1510;
mul.f32 f286, f256, 0f3E31D0D4;
fma.rn.f32 f287, f250, 0f3F7C1C5C, f286;
mul.f32 f1507, f266, 0fBF708FB2;
mul.f32 f1508, f272, 0f3EAF1D44;
sub.f32 f290, f1507, f1508;
mul.f32 f291, f272, 0fBF708FB2;
fma.rn.f32 f292, f266, 0f3EAF1D44, f291;
add.f32 f293, f242, f258;
add.f32 f294, f226, f293;
mul.f32 f297, f293, 0f3F000000;
sub.f32 f298, f226, f297;
add.f32 f1506, f1518, f1513;
sub.f32 f299, f1518, f1513;
mul.f32 f300, f299, 0fBF5DB3D7;
add.f32 f301, f300, f298;
sub.f32 f302, f298, f300;
add.f32 f1505, f1523, f1506;
mul.f32 f303, f1506, 0f3F000000;
sub.f32 f304, f1523, f303;
sub.f32 f305, f242, f258;
mul.f32 f306, f305, 0fBF5DB3D7;
sub.f32 f307, f304, f306;
add.f32 f308, f306, f304;
add.f32 f309, f275, f280;
add.f32 f310, f233, f309;
mul.f32 f313, f309, 0f3F000000;
sub.f32 f314, f233, f313;
add.f32 f1504, f277, f282;
sub.f32 f315, f277, f282;
mul.f32 f316, f315, 0fBF5DB3D7;
add.f32 f317, f316, f314;
sub.f32 f318, f314, f316;
add.f32 f1503, f239, f1504;
mul.f32 f319, f1504, 0f3F000000;
sub.f32 f320, f239, f319;
sub.f32 f321, f275, f280;
mul.f32 f322, f321, 0fBF5DB3D7;
sub.f32 f323, f320, f322;
add.f32 f324, f322, f320;
add.f32 f325, f285, f290;
add.f32 f326, f234, f325;
mul.f32 f329, f325, 0f3F000000;
sub.f32 f330, f234, f329;
add.f32 f1502, f287, f292;
sub.f32 f331, f287, f292;
mul.f32 f332, f331, 0fBF5DB3D7;
add.f32 f333, f332, f330;
sub.f32 f334, f330, f332;
add.f32 f1501, f240, f1502;
mul.f32 f335, f1502, 0f3F000000;
sub.f32 f336, f240, f335;
sub.f32 f337, f285, f290;
mul.f32 f338, f337, 0fBF5DB3D7;
sub.f32 f339, f336, f338;
add.f32 f340, f338, f336;
add.f32 f341, %78, %96;
add.f32 f342, %60, f341;
mul.f32 f345, f341, 0f3F000000;
sub.f32 f346, %60, f345;
add.f32 f1498, %119, %120;
sub.f32 f347, %119, %120;
mul.f32 f348, f347, 0fBF5DB3D7;
add.f32 f349, f348, f346;
sub.f32 f350, f346, f348;
add.f32 f1496, %121, f1498;
mul.f32 f351, f1498, 0f3F000000;
sub.f32 f352, %121, f351;
sub.f32 f353, %78, %96;
mul.f32 f354, f353, 0fBF5DB3D7;
sub.f32 f355, f352, f354;
add.f32 f356, f354, f352;
add.f32 f357, %84, %102;
add.f32 f358, %66, f357;
mul.f32 f361, f357, 0f3F000000;
sub.f32 f362, %66, f361;
add.f32 f1493, %123, %122;
sub.f32 f363, %123, %122;
mul.f32 f364, f363, 0fBF5DB3D7;
add.f32 f365, f364, f362;
sub.f32 f366, f362, f364;
add.f32 f1491, %124, f1493;
mul.f32 f367, f1493, 0f3F000000;
sub.f32 f368, %124, f367;
sub.f32 f369, %84, %102;
mul.f32 f370, f369, 0fBF5DB3D7;
sub.f32 f371, f368, f370;
add.f32 f372, f370, f368;
add.f32 f373, %90, %108;
add.f32 f374, %72, f373;
mul.f32 f377, f373, 0f3F000000;
sub.f32 f378, %72, f377;
add.f32 f1489, %125, %109;
sub.f32 f379, %125, %109;
mul.f32 f380, f379, 0fBF5DB3D7;
add.f32 f381, f380, f378;
sub.f32 f382, f378, f380;
add.f32 f1487, %126, f1489;
mul.f32 f383, f1489, 0f3F000000;
sub.f32 f384, %126, f383;
sub.f32 f385, %90, %108;
mul.f32 f386, f385, 0fBF5DB3D7;
sub.f32 f387, f384, f386;
add.f32 f388, f386, f384;
mul.f32 f390, f371, 0f3F248DBB;
mul.f32 f1486, f365, 0f3F441B7D;
sub.f32 f391, f1486, f390;
mul.f32 f392, f371, 0f3F441B7D;
fma.rn.f32 f393, f365, 0f3F248DBB, f392;
mul.f32 f395, f387, 0f3F7C1C5C;
mul.f32 f1485, f381, 0f3E31D0D4;
sub.f32 f396, f1485, f395;
mul.f32 f397, f387, 0f3E31D0D4;
fma.rn.f32 f398, f381, 0f3F7C1C5C, f397;
mul.f32 f1483, f366, 0f3E31D0D4;
mul.f32 f1484, f372, 0f3F7C1C5C;
sub.f32 f401, f1483, f1484;
mul.f32 f402, f372, 0f3E31D0D4;
fma.rn.f32 f403, f366, 0f3F7C1C5C, f402;
mul.f32 f1481, f382, 0fBF708FB2;
mul.f32 f1482, f388, 0f3EAF1D44;
sub.f32 f406, f1481, f1482;
mul.f32 f407, f388, 0fBF708FB2;
fma.rn.f32 f408, f382, 0f3EAF1D44, f407;
add.f32 f409, f358, f374;
add.f32 f410, f342, f409;
mul.f32 f413, f409, 0f3F000000;
sub.f32 f414, f342, f413;
add.f32 f1480, f1491, f1487;
sub.f32 f415, f1491, f1487;
mul.f32 f416, f415, 0fBF5DB3D7;
add.f32 f417, f416, f414;
sub.f32 f418, f414, f416;
add.f32 f1479, f1496, f1480;
mul.f32 f419, f1480, 0f3F000000;
sub.f32 f420, f1496, f419;
sub.f32 f421, f358, f374;
mul.f32 f422, f421, 0fBF5DB3D7;
sub.f32 f423, f420, f422;
add.f32 f424, f422, f420;
add.f32 f425, f391, f396;
add.f32 f426, f349, f425;
mul.f32 f429, f425, 0f3F000000;
sub.f32 f430, f349, f429;
add.f32 f1478, f393, f398;
sub.f32 f431, f393, f398;
mul.f32 f432, f431, 0fBF5DB3D7;
add.f32 f433, f432, f430;
sub.f32 f434, f430, f432;
add.f32 f1477, f355, f1478;
mul.f32 f435, f1478, 0f3F000000;
sub.f32 f436, f355, f435;
sub.f32 f437, f391, f396;
mul.f32 f438, f437, 0fBF5DB3D7;
sub.f32 f439, f436, f438;
add.f32 f440, f438, f436;
add.f32 f441, f401, f406;
add.f32 f442, f350, f441;
mul.f32 f445, f441, 0f3F000000;
sub.f32 f446, f350, f445;
add.f32 f1476, f403, f408;
sub.f32 f447, f403, f408;
mul.f32 f448, f447, 0fBF5DB3D7;
add.f32 f449, f448, f446;
sub.f32 f450, f446, f448;
add.f32 f1475, f356, f1476;
mul.f32 f451, f1476, 0f3F000000;
sub.f32 f452, f356, f451;
sub.f32 f453, f401, f406;
mul.f32 f454, f453, 0fBF5DB3D7;
sub.f32 f455, f452, f454;
add.f32 f456, f454, f452;
mul.f32 f458, f1503, 0f3E6C2691;
mul.f32 f1474, f310, 0f3F791978;
sub.f32 f459, f1474, f458;
mul.f32 f460, f1503, 0f3F791978;
fma.rn.f32 f461, f310, 0f3E6C2691, f460;
mul.f32 f1472, f426, 0f3F64C51C;
mul.f32 f1473, f1477, 0f3EE5C902;
sub.f32 f464, f1472, f1473;
mul.f32 f465, f1477, 0f3F64C51C;
fma.rn.f32 f466, f426, 0f3EE5C902, f465;
mul.f32 f1470, f326, 0f3F64C51C;
mul.f32 f1471, f1501, 0f3EE5C902;
sub.f32 f469, f1470, f1471;
mul.f32 f470, f1501, 0f3F64C51C;
fma.rn.f32 f471, f326, 0f3EE5C902, f470;
mul.f32 f1468, f442, 0f3F18DF63;
mul.f32 f1469, f1475, 0f3F4D57F2;
sub.f32 f474, f1468, f1469;
mul.f32 f475, f1475, 0f3F18DF63;
fma.rn.f32 f476, f442, 0f3F4D57F2, f475;
mul.f32 f1466, f301, 0f3F441B7D;
mul.f32 f1467, f307, 0f3F248DBB;
sub.f32 f479, f1466, f1467;
mul.f32 f480, f307, 0f3F441B7D;
fma.rn.f32 f481, f301, 0f3F248DBB, f480;
mul.f32 f483, f423, 0f3F7C1C5C;
mul.f32 f1465, f417, 0f3E31D0D4;
sub.f32 f484, f1465, f483;
mul.f32 f485, f423, 0f3E31D0D4;
fma.rn.f32 f486, f417, 0f3F7C1C5C, f485;
mul.f32 f488, f323, 0f3F4D57F2;
mul.f32 f1464, f317, 0f3F18DF63;
sub.f32 f489, f1464, f488;
mul.f32 f490, f323, 0f3F18DF63;
fma.rn.f32 f491, f317, 0f3F4D57F2, f490;
mul.f32 f493, f439, 0f3F753ECD;
mul.f32 f1463, f433, 0fBE92D7E0;
sub.f32 f494, f1463, f493;
mul.f32 f495, f439, 0fBE92D7E0;
fma.rn.f32 f496, f433, 0f3F753ECD, f495;
mul.f32 f498, f339, 0f3F6B1036;
mul.f32 f1462, f333, 0f3ECACAF8;
sub.f32 f499, f1462, f498;
mul.f32 f500, f339, 0f3ECACAF8;
fma.rn.f32 f501, f333, 0f3F6B1036, f500;
mul.f32 f503, f455, 0f3F3A3529;
mul.f32 f1461, f449, 0fBF2FAD88;
sub.f32 f504, f1461, f503;
mul.f32 f505, f455, 0fBF2FAD88;
fma.rn.f32 f506, f449, 0f3F3A3529, f505;
mul.f32 f508, f308, 0f3F7C1C5C;
mul.f32 f1460, f302, 0f3E31D0D4;
sub.f32 f509, f1460, f508;
mul.f32 f510, f308, 0f3E31D0D4;
fma.rn.f32 f511, f302, 0f3F7C1C5C, f510;
mul.f32 f1458, f418, 0fBF708FB2;
mul.f32 f1459, f424, 0f3EAF1D44;
sub.f32 f514, f1458, f1459;
mul.f32 f515, f424, 0fBF708FB2;
fma.rn.f32 f516, f418, 0f3EAF1D44, f515;
mul.f32 f1456, f318, 0fBD6E2946;
mul.f32 f1457, f324, 0f3F7F9120;
sub.f32 f519, f1456, f1457;
mul.f32 f520, f324, 0fBD6E2946;
fma.rn.f32 f521, f318, 0f3F7F9120, f520;
mul.f32 f1454, f434, 0fBF7E44DE;
mul.f32 f1455, f440, 0fBDEDC21F;
sub.f32 f524, f1454, f1455;
mul.f32 f525, f440, 0fBF7E44DE;
fma.rn.f32 f526, f434, 0fBDEDC21F, f525;
mul.f32 f528, f340, 0f3F753ECD;
mul.f32 f1453, f334, 0fBE92D7E0;
sub.f32 f529, f1453, f528;
mul.f32 f530, f340, 0fBE92D7E0;
fma.rn.f32 f531, f334, 0f3F753ECD, f530;
mul.f32 f533, f456, 0fBF0CAC9F;
mul.f32 f1452, f450, 0fBF55E287;
sub.f32 f534, f1452, f533;
mul.f32 f535, f456, 0fBF55E287;
fma.rn.f32 f536, f450, 0fBF0CAC9F, f535;
add.f32 f537, f294, f410;
add.f32 f538, f178, f537;
mul.f32 f541, f537, 0f3F000000;
sub.f32 f542, f178, f541;
add.f32 f1451, f1505, f1479;
sub.f32 f543, f1505, f1479;
mul.f32 f544, f543, 0fBF5DB3D7;
add.f32 f545, f544, f542;
sub.f32 f546, f542, f544;
add.f32 f1450, f1532, f1451;
mul.f32 f547, f1451, 0f3F000000;
sub.f32 f548, f1532, f547;
sub.f32 f549, f294, f410;
mul.f32 f550, f549, 0fBF5DB3D7;
sub.f32 f551, f548, f550;
add.f32 f552, f550, f548;
add.f32 f553, f459, f464;
add.f32 f554, f194, f553;
mul.f32 f557, f553, 0f3F000000;
sub.f32 f558, f194, f557;
add.f32 f1449, f461, f466;
sub.f32 f559, f461, f466;
mul.f32 f560, f559, 0fBF5DB3D7;
add.f32 f561, f560, f558;
sub.f32 f562, f558, f560;
add.f32 f1448, f1530, f1449;
mul.f32 f563, f1449, 0f3F000000;
sub.f32 f564, f1530, f563;
sub.f32 f565, f459, f464;
mul.f32 f566, f565, 0fBF5DB3D7;
sub.f32 f567, f564, f566;
add.f32 f568, f566, f564;
add.f32 f569, f469, f474;
add.f32 f570, f210, f569;
mul.f32 f573, f569, 0f3F000000;
sub.f32 f574, f210, f573;
add.f32 f1447, f471, f476;
sub.f32 f575, f471, f476;
mul.f32 f576, f575, 0fBF5DB3D7;
add.f32 f577, f576, f574;
sub.f32 f578, f574, f576;
add.f32 f1446, f1528, f1447;
mul.f32 f579, f1447, 0f3F000000;
sub.f32 f580, f1528, f579;
sub.f32 f581, f469, f474;
mul.f32 f582, f581, 0fBF5DB3D7;
sub.f32 f583, f580, f582;
add.f32 f584, f582, f580;
add.f32 f585, f479, f484;
add.f32 f586, f185, f585;
mul.f32 f589, f585, 0f3F000000;
sub.f32 f590, f185, f589;
add.f32 f1445, f481, f486;
sub.f32 f591, f481, f486;
mul.f32 f592, f591, 0fBF5DB3D7;
add.f32 f593, f592, f590;
sub.f32 f594, f590, f592;
add.f32 f1444, f191, f1445;
mul.f32 f595, f1445, 0f3F000000;
sub.f32 f596, f191, f595;
sub.f32 f597, f479, f484;
mul.f32 f598, f597, 0fBF5DB3D7;
sub.f32 f599, f596, f598;
add.f32 f600, f598, f596;
add.f32 f601, f489, f494;
add.f32 f602, f201, f601;
mul.f32 f605, f601, 0f3F000000;
sub.f32 f606, f201, f605;
add.f32 f1443, f491, f496;
sub.f32 f607, f491, f496;
mul.f32 f608, f607, 0fBF5DB3D7;
add.f32 f609, f608, f606;
sub.f32 f610, f606, f608;
add.f32 f1442, f207, f1443;
mul.f32 f611, f1443, 0f3F000000;
sub.f32 f612, f207, f611;
sub.f32 f613, f489, f494;
mul.f32 f614, f613, 0fBF5DB3D7;
sub.f32 f615, f612, f614;
add.f32 f616, f614, f612;
add.f32 f617, f499, f504;
add.f32 f618, f217, f617;
mul.f32 f621, f617, 0f3F000000;
sub.f32 f622, f217, f621;
add.f32 f1441, f501, f506;
sub.f32 f623, f501, f506;
mul.f32 f624, f623, 0fBF5DB3D7;
add.f32 f625, f624, f622;
sub.f32 f626, f622, f624;
add.f32 f1440, f223, f1441;
mul.f32 f627, f1441, 0f3F000000;
sub.f32 f628, f223, f627;
sub.f32 f629, f499, f504;
mul.f32 f630, f629, 0fBF5DB3D7;
sub.f32 f631, f628, f630;
add.f32 f632, f630, f628;
add.f32 f633, f509, f514;
add.f32 f634, f186, f633;
mul.f32 f637, f633, 0f3F000000;
sub.f32 f638, f186, f637;
add.f32 f1439, f511, f516;
sub.f32 f639, f511, f516;
mul.f32 f640, f639, 0fBF5DB3D7;
add.f32 f641, f640, f638;
sub.f32 f642, f638, f640;
add.f32 f1438, f192, f1439;
mul.f32 f643, f1439, 0f3F000000;
sub.f32 f644, f192, f643;
sub.f32 f645, f509, f514;
mul.f32 f646, f645, 0fBF5DB3D7;
sub.f32 f647, f644, f646;
add.f32 f648, f646, f644;
add.f32 f649, f519, f524;
add.f32 f650, f202, f649;
mul.f32 f653, f649, 0f3F000000;
sub.f32 f654, f202, f653;
add.f32 f1437, f521, f526;
sub.f32 f655, f521, f526;
mul.f32 f656, f655, 0fBF5DB3D7;
add.f32 f657, f656, f654;
sub.f32 f658, f654, f656;
add.f32 f1436, f208, f1437;
mul.f32 f659, f1437, 0f3F000000;
sub.f32 f660, f208, f659;
sub.f32 f661, f519, f524;
mul.f32 f662, f661, 0fBF5DB3D7;
sub.f32 f663, f660, f662;
add.f32 f664, f662, f660;
add.f32 f665, f529, f534;
add.f32 f666, f218, f665;
mul.f32 f669, f665, 0f3F000000;
sub.f32 f670, f218, f669;
add.f32 f1435, f531, f536;
sub.f32 f671, f531, f536;
mul.f32 f672, f671, 0fBF5DB3D7;
add.f32 f673, f672, f670;
sub.f32 f674, f670, f672;
add.f32 f1434, f224, f1435;
mul.f32 f675, f1435, 0f3F000000;
sub.f32 f676, f224, f675;
sub.f32 f677, f529, f534;
mul.f32 f678, f677, 0fBF5DB3D7;
sub.f32 f679, f676, f678;
add.f32 f680, f678, f676;
mov.u32 r11, %tid.x;
mul.wide.u32 rd2, r11, 954437177;
shr.u64 rd3, rd2, 33;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 9;
sub.s32 r7, r11, r6;
mov.u64 rd5, %55;
mul.wide.u32 rd7, r7, 8;
add.s64 rd6, rd5, rd7;
ld.global.v2.f32 {f681, f682}, [rd6];
mul.f32 f685, f1448, f682;
fma.rn.f32 f686, f681, f554, f685;
mul.f32 f687, f554, f682;
mul.f32 f688, f681, f1448;
sub.f32 f689, f688, f687;
mul.f32 f691, f682, f682;
mul.f32 f1433, f681, f681;
sub.f32 f692, f1433, f691;
mul.f32 f693, f682, f681;
fma.rn.f32 f694, f682, f681, f693;
mul.f32 f695, f1446, f694;
fma.rn.f32 f696, f692, f570, f695;
mul.f32 f697, f570, f694;
mul.f32 f698, f692, f1446;
sub.f32 f699, f698, f697;
mul.f32 f701, f682, f694;
mul.f32 f1432, f681, f692;
sub.f32 f702, f1432, f701;
mul.f32 f703, f681, f694;
fma.rn.f32 f704, f682, f692, f703;
mul.f32 f705, f1444, f704;
fma.rn.f32 f706, f702, f586, f705;
mul.f32 f707, f586, f704;
mul.f32 f708, f702, f1444;
sub.f32 f709, f708, f707;
mul.f32 f1430, f681, f702;
mul.f32 f1431, f682, f704;
sub.f32 f712, f1430, f1431;
mul.f32 f713, f681, f704;
fma.rn.f32 f714, f682, f702, f713;
mul.f32 f715, f1442, f714;
fma.rn.f32 f716, f712, f602, f715;
mul.f32 f717, f602, f714;
mul.f32 f718, f712, f1442;
sub.f32 f719, f718, f717;
mul.f32 f1428, f681, f712;
mul.f32 f1429, f682, f714;
sub.f32 f722, f1428, f1429;
mul.f32 f723, f681, f714;
fma.rn.f32 f724, f682, f712, f723;
mul.f32 f725, f1440, f724;
fma.rn.f32 f726, f722, f618, f725;
mul.f32 f727, f618, f724;
mul.f32 f728, f722, f1440;
sub.f32 f729, f728, f727;
mul.f32 f731, f682, f724;
mul.f32 f1427, f681, f722;
sub.f32 f732, f1427, f731;
mul.f32 f733, f681, f724;
fma.rn.f32 f734, f682, f722, f733;
mul.f32 f735, f1438, f734;
fma.rn.f32 f736, f732, f634, f735;
mul.f32 f737, f634, f734;
mul.f32 f738, f732, f1438;
sub.f32 f739, f738, f737;
mul.f32 f741, f682, f734;
mul.f32 f1426, f681, f732;
sub.f32 f742, f1426, f741;
mul.f32 f743, f681, f734;
fma.rn.f32 f744, f682, f732, f743;
mul.f32 f745, f1436, f744;
fma.rn.f32 f746, f742, f650, f745;
mul.f32 f747, f650, f744;
mul.f32 f748, f742, f1436;
sub.f32 f749, f748, f747;
mul.f32 f751, f682, f744;
mul.f32 f1425, f681, f742;
sub.f32 f752, f1425, f751;
mul.f32 f753, f681, f744;
fma.rn.f32 f754, f682, f742, f753;
mul.f32 f755, f1434, f754;
fma.rn.f32 f756, f752, f666, f755;
mul.f32 f757, f666, f754;
mul.f32 f758, f752, f1434;
sub.f32 f759, f758, f757;
mul.f32 f1423, f681, f752;
mul.f32 f1424, f682, f754;
sub.f32 f762, f1423, f1424;
mul.f32 f763, f681, f754;
fma.rn.f32 f764, f682, f752, f763;
mul.f32 f765, f551, f764;
fma.rn.f32 f766, f762, f545, f765;
mul.f32 f767, f545, f764;
mul.f32 f768, f762, f551;
sub.f32 f769, f768, f767;
mul.f32 f1421, f681, f762;
mul.f32 f1422, f682, f764;
sub.f32 f772, f1421, f1422;
mul.f32 f773, f681, f764;
fma.rn.f32 f774, f682, f762, f773;
mul.f32 f775, f567, f774;
fma.rn.f32 f776, f772, f561, f775;
mul.f32 f777, f561, f774;
mul.f32 f778, f772, f567;
sub.f32 f779, f778, f777;
mul.f32 f781, f682, f774;
mul.f32 f1420, f681, f772;
sub.f32 f782, f1420, f781;
mul.f32 f783, f681, f774;
fma.rn.f32 f784, f682, f772, f783;
mul.f32 f785, f583, f784;
fma.rn.f32 f786, f782, f577, f785;
mul.f32 f787, f577, f784;
mul.f32 f788, f782, f583;
sub.f32 f789, f788, f787;
mul.f32 f791, f682, f784;
mul.f32 f1419, f681, f782;
sub.f32 f792, f1419, f791;
mul.f32 f793, f681, f784;
fma.rn.f32 f794, f682, f782, f793;
mul.f32 f795, f599, f794;
fma.rn.f32 f796, f792, f593, f795;
mul.f32 f797, f593, f794;
mul.f32 f798, f792, f599;
sub.f32 f799, f798, f797;
mul.f32 f801, f682, f794;
mul.f32 f1418, f681, f792;
sub.f32 f802, f1418, f801;
mul.f32 f803, f681, f794;
fma.rn.f32 f804, f682, f792, f803;
mul.f32 f805, f615, f804;
fma.rn.f32 f806, f802, f609, f805;
mul.f32 f807, f609, f804;
mul.f32 f808, f802, f615;
sub.f32 f809, f808, f807;
mul.f32 f1416, f681, f802;
mul.f32 f1417, f682, f804;
sub.f32 f812, f1416, f1417;
mul.f32 f813, f681, f804;
fma.rn.f32 f814, f682, f802, f813;
mul.f32 f815, f631, f814;
fma.rn.f32 f816, f812, f625, f815;
mul.f32 f817, f625, f814;
mul.f32 f818, f812, f631;
sub.f32 f819, f818, f817;
mul.f32 f1414, f681, f812;
mul.f32 f1415, f682, f814;
sub.f32 f822, f1414, f1415;
mul.f32 f823, f681, f814;
fma.rn.f32 f824, f682, f812, f823;
mul.f32 f825, f647, f824;
fma.rn.f32 f826, f822, f641, f825;
mul.f32 f827, f641, f824;
mul.f32 f828, f822, f647;
sub.f32 f829, f828, f827;
mul.f32 f831, f682, f824;
mul.f32 f1413, f681, f822;
sub.f32 f832, f1413, f831;
mul.f32 f833, f681, f824;
fma.rn.f32 f834, f682, f822, f833;
mul.f32 f835, f663, f834;
fma.rn.f32 f836, f832, f657, f835;
mul.f32 f837, f657, f834;
mul.f32 f838, f832, f663;
sub.f32 f839, f838, f837;
mul.f32 f841, f682, f834;
mul.f32 f1412, f681, f832;
sub.f32 f842, f1412, f841;
mul.f32 f843, f681, f834;
fma.rn.f32 f844, f682, f832, f843;
mul.f32 f845, f679, f844;
fma.rn.f32 f846, f842, f673, f845;
mul.f32 f847, f673, f844;
mul.f32 f848, f842, f679;
sub.f32 f849, f848, f847;
mul.f32 f1410, f681, f842;
mul.f32 f1411, f682, f844;
sub.f32 f852, f1410, f1411;
mul.f32 f853, f681, f844;
fma.rn.f32 f854, f682, f842, f853;
mul.f32 f855, f552, f854;
fma.rn.f32 f856, f852, f546, f855;
mul.f32 f857, f546, f854;
mul.f32 f858, f852, f552;
sub.f32 f859, f858, f857;
mul.f32 f1408, f681, f852;
mul.f32 f1409, f682, f854;
sub.f32 f862, f1408, f1409;
mul.f32 f863, f681, f854;
fma.rn.f32 f864, f682, f852, f863;
mul.f32 f865, f568, f864;
fma.rn.f32 f866, f862, f562, f865;
mul.f32 f867, f562, f864;
mul.f32 f868, f862, f568;
sub.f32 f869, f868, f867;
mul.f32 f871, f682, f864;
mul.f32 f1407, f681, f862;
sub.f32 f872, f1407, f871;
mul.f32 f873, f681, f864;
fma.rn.f32 f874, f682, f862, f873;
mul.f32 f875, f584, f874;
fma.rn.f32 f876, f872, f578, f875;
mul.f32 f877, f578, f874;
mul.f32 f878, f872, f584;
sub.f32 f879, f878, f877;
mul.f32 f881, f682, f874;
mul.f32 f1406, f681, f872;
sub.f32 f882, f1406, f881;
mul.f32 f883, f681, f874;
fma.rn.f32 f884, f682, f872, f883;
mul.f32 f885, f600, f884;
fma.rn.f32 f886, f882, f594, f885;
mul.f32 f887, f594, f884;
mul.f32 f888, f882, f600;
sub.f32 f889, f888, f887;
mul.f32 f891, f682, f884;
mul.f32 f1405, f681, f882;
sub.f32 f892, f1405, f891;
mul.f32 f893, f681, f884;
fma.rn.f32 f894, f682, f882, f893;
mul.f32 f895, f616, f894;
fma.rn.f32 f896, f892, f610, f895;
mul.f32 f897, f610, f894;
mul.f32 f898, f892, f616;
sub.f32 f899, f898, f897;
mul.f32 f1403, f681, f892;
mul.f32 f1404, f682, f894;
sub.f32 f902, f1403, f1404;
mul.f32 f903, f681, f894;
fma.rn.f32 f904, f682, f892, f903;
mul.f32 f905, f632, f904;
fma.rn.f32 f906, f902, f626, f905;
mul.f32 f907, f626, f904;
mul.f32 f908, f902, f632;
sub.f32 f909, f908, f907;
mul.f32 f1401, f681, f902;
mul.f32 f1402, f682, f904;
sub.f32 f912, f1401, f1402;
mul.f32 f913, f681, f904;
fma.rn.f32 f914, f682, f902, f913;
mul.f32 f915, f648, f914;
fma.rn.f32 f916, f912, f642, f915;
mul.f32 f917, f642, f914;
mul.f32 f918, f912, f648;
sub.f32 f919, f918, f917;
mul.f32 f921, f682, f914;
mul.f32 f1400, f681, f912;
sub.f32 f922, f1400, f921;
mul.f32 f923, f681, f914;
fma.rn.f32 f924, f682, f912, f923;
mul.f32 f925, f664, f924;
fma.rn.f32 f926, f922, f658, f925;
mul.f32 f927, f658, f924;
mul.f32 f928, f922, f664;
sub.f32 f929, f928, f927;
mul.f32 f931, f682, f924;
mul.f32 f1399, f681, f922;
sub.f32 f932, f1399, f931;
mul.f32 f933, f681, f924;
fma.rn.f32 f934, f682, f922, f933;
mul.f32 f935, f680, f934;
fma.rn.f32 f936, f932, f674, f935;
mul.f32 f937, f674, f934;
mul.f32 f938, f932, f680;
sub.f32 f939, f938, f937;
mad.lo.s32 r8, r5, 972, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 108, r8;
st.shared.f32 [r9], f538;
st.shared.f32 [r9+4], f686;
st.shared.f32 [r9+8], f696;
st.shared.f32 [r9+12], f706;
st.shared.f32 [r9+16], f716;
st.shared.f32 [r9+20], f726;
st.shared.f32 [r9+24], f736;
st.shared.f32 [r9+28], f746;
st.shared.f32 [r9+32], f756;
st.shared.f32 [r9+36], f766;
st.shared.f32 [r9+40], f776;
st.shared.f32 [r9+44], f786;
st.shared.f32 [r9+48], f796;
st.shared.f32 [r9+52], f806;
st.shared.f32 [r9+56], f816;
st.shared.f32 [r9+60], f826;
st.shared.f32 [r9+64], f836;
st.shared.f32 [r9+68], f846;
st.shared.f32 [r9+72], f856;
st.shared.f32 [r9+76], f866;
st.shared.f32 [r9+80], f876;
st.shared.f32 [r9+84], f886;
st.shared.f32 [r9+88], f896;
st.shared.f32 [r9+92], f906;
st.shared.f32 [r9+96], f916;
st.shared.f32 [r9+100], f926;
st.shared.f32 [r9+104], f936;
barrier.sync 0;
mad.lo.s32 r10, r7, -104, r9;
ld.shared.f32 f940, [r10];
ld.shared.f32 f941, [r10+36];
ld.shared.f32 f942, [r10+72];
ld.shared.f32 f943, [r10+108];
ld.shared.f32 f944, [r10+144];
ld.shared.f32 f945, [r10+180];
ld.shared.f32 f946, [r10+216];
ld.shared.f32 f947, [r10+252];
ld.shared.f32 f948, [r10+288];
ld.shared.f32 f949, [r10+324];
ld.shared.f32 f950, [r10+360];
ld.shared.f32 f951, [r10+396];
ld.shared.f32 f952, [r10+432];
ld.shared.f32 f953, [r10+468];
ld.shared.f32 f954, [r10+504];
ld.shared.f32 f955, [r10+540];
ld.shared.f32 f956, [r10+576];
ld.shared.f32 f957, [r10+612];
ld.shared.f32 f958, [r10+648];
ld.shared.f32 f959, [r10+684];
ld.shared.f32 f960, [r10+720];
ld.shared.f32 f961, [r10+756];
ld.shared.f32 f962, [r10+792];
ld.shared.f32 f963, [r10+828];
ld.shared.f32 f964, [r10+864];
ld.shared.f32 f965, [r10+900];
ld.shared.f32 f966, [r10+936];
barrier.sync 0;
st.shared.f32 [r9], f1450;
st.shared.f32 [r9+4], f689;
st.shared.f32 [r9+8], f699;
st.shared.f32 [r9+12], f709;
st.shared.f32 [r9+16], f719;
st.shared.f32 [r9+20], f729;
st.shared.f32 [r9+24], f739;
st.shared.f32 [r9+28], f749;
st.shared.f32 [r9+32], f759;
st.shared.f32 [r9+36], f769;
st.shared.f32 [r9+40], f779;
st.shared.f32 [r9+44], f789;
st.shared.f32 [r9+48], f799;
st.shared.f32 [r9+52], f809;
st.shared.f32 [r9+56], f819;
st.shared.f32 [r9+60], f829;
st.shared.f32 [r9+64], f839;
st.shared.f32 [r9+68], f849;
st.shared.f32 [r9+72], f859;
st.shared.f32 [r9+76], f869;
st.shared.f32 [r9+80], f879;
st.shared.f32 [r9+84], f889;
st.shared.f32 [r9+88], f899;
st.shared.f32 [r9+92], f909;
st.shared.f32 [r9+96], f919;
st.shared.f32 [r9+100], f929;
st.shared.f32 [r9+104], f939;
barrier.sync 0;
ld.shared.f32 f967, [r10];
ld.shared.f32 f968, [r10+36];
ld.shared.f32 f969, [r10+72];
ld.shared.f32 f970, [r10+108];
ld.shared.f32 f971, [r10+144];
ld.shared.f32 f972, [r10+180];
ld.shared.f32 f973, [r10+216];
ld.shared.f32 f974, [r10+252];
ld.shared.f32 f975, [r10+288];
ld.shared.f32 f976, [r10+324];
ld.shared.f32 f977, [r10+360];
ld.shared.f32 f978, [r10+396];
ld.shared.f32 f979, [r10+432];
ld.shared.f32 f980, [r10+468];
ld.shared.f32 f981, [r10+504];
ld.shared.f32 f982, [r10+540];
ld.shared.f32 f983, [r10+576];
ld.shared.f32 f984, [r10+612];
ld.shared.f32 f985, [r10+648];
ld.shared.f32 f986, [r10+684];
ld.shared.f32 f987, [r10+720];
ld.shared.f32 f988, [r10+756];
ld.shared.f32 f989, [r10+792];
ld.shared.f32 f990, [r10+828];
ld.shared.f32 f991, [r10+864];
ld.shared.f32 f992, [r10+900];
ld.shared.f32 f993, [r10+936];
add.f32 f994, f949, f958;
add.f32 f995, f940, f994;
mul.f32 f998, f994, 0f3F000000;
sub.f32 f999, f940, f998;
add.f32 f1398, f976, f985;
sub.f32 f1000, f976, f985;
mul.f32 f1001, f1000, 0fBF5DB3D7;
add.f32 f1002, f1001, f999;
sub.f32 f1003, f999, f1001;
add.f32 f1397, f967, f1398;
mul.f32 f1004, f1398, 0f3F000000;
sub.f32 f1005, f967, f1004;
sub.f32 f1006, f949, f958;
mul.f32 f1007, f1006, 0fBF5DB3D7;
sub.f32 f1008, f1005, f1007;
add.f32 f1009, f1007, f1005;
add.f32 f1010, f952, f961;
add.f32 f1011, f943, f1010;
mul.f32 f1014, f1010, 0f3F000000;
sub.f32 f1015, f943, f1014;
add.f32 f1396, f979, f988;
sub.f32 f1016, f979, f988;
mul.f32 f1017, f1016, 0fBF5DB3D7;
add.f32 f1018, f1017, f1015;
sub.f32 f1019, f1015, f1017;
add.f32 f1395, f970, f1396;
mul.f32 f1020, f1396, 0f3F000000;
sub.f32 f1021, f970, f1020;
sub.f32 f1022, f952, f961;
mul.f32 f1023, f1022, 0fBF5DB3D7;
sub.f32 f1024, f1021, f1023;
add.f32 f1025, f1023, f1021;
add.f32 f1026, f955, f964;
add.f32 f1027, f946, f1026;
mul.f32 f1030, f1026, 0f3F000000;
sub.f32 f1031, f946, f1030;
add.f32 f1394, f982, f991;
sub.f32 f1032, f982, f991;
mul.f32 f1033, f1032, 0fBF5DB3D7;
add.f32 f1034, f1033, f1031;
sub.f32 f1035, f1031, f1033;
add.f32 f1393, f973, f1394;
mul.f32 f1036, f1394, 0f3F000000;
sub.f32 f1037, f973, f1036;
sub.f32 f1038, f955, f964;
mul.f32 f1039, f1038, 0fBF5DB3D7;
sub.f32 f1040, f1037, f1039;
add.f32 f1041, f1039, f1037;
mul.f32 f1391, f1018, 0f3F441B7D;
mul.f32 f1392, f1024, 0f3F248DBB;
sub.f32 f1044, f1391, f1392;
mul.f32 f1045, f1024, 0f3F441B7D;
fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045;
mul.f32 f1048, f1040, 0f3F7C1C5C;
mul.f32 f1390, f1034, 0f3E31D0D4;
sub.f32 f1049, f1390, f1048;
mul.f32 f1050, f1040, 0f3E31D0D4;
fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050;
mul.f32 f1053, f1025, 0f3F7C1C5C;
mul.f32 f1389, f1019, 0f3E31D0D4;
sub.f32 f1054, f1389, f1053;
mul.f32 f1055, f1025, 0f3E31D0D4;
fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055;
mul.f32 f1058, f1041, 0f3EAF1D44;
mul.f32 f1388, f1035, 0fBF708FB2;
sub.f32 f1059, f1388, f1058;
mul.f32 f1060, f1041, 0fBF708FB2;
fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060;
add.f32 f1062, f1011, f1027;
mul.f32 f1064, f1062, 0f3F000000;
sub.f32 f1065, f995, f1064;
add.f32 f1387, f1395, f1393;
sub.f32 f1066, f1395, f1393;
mul.f32 f1067, f1066, 0fBF5DB3D7;
mul.f32 f1068, f1387, 0f3F000000;
sub.f32 f1069, f1397, f1068;
sub.f32 f1070, f1011, f1027;
mul.f32 f1071, f1070, 0fBF5DB3D7;
add.f32 f1072, f1044, f1049;
mul.f32 f1074, f1072, 0f3F000000;
sub.f32 f1075, f1002, f1074;
add.f32 f1386, f1046, f1051;
sub.f32 f1076, f1046, f1051;
mul.f32 f1077, f1076, 0fBF5DB3D7;
mul.f32 f1078, f1386, 0f3F000000;
sub.f32 f1079, f1008, f1078;
sub.f32 f1080, f1044, f1049;
mul.f32 f1081, f1080, 0fBF5DB3D7;
add.f32 f1082, f1054, f1059;
mul.f32 f1084, f1082, 0f3F000000;
sub.f32 f1085, f1003, f1084;
add.f32 f1385, f1056, f1061;
sub.f32 f1086, f1056, f1061;
mul.f32 f1087, f1086, 0fBF5DB3D7;
mul.f32 f1088, f1385, 0f3F000000;
sub.f32 f1089, f1009, f1088;
sub.f32 f1090, f1054, f1059;
mul.f32 f1091, f1090, 0fBF5DB3D7;
add.f32 f1092, f950, f959;
add.f32 f1093, f941, f1092;
mul.f32 f1096, f1092, 0f3F000000;
sub.f32 f1097, f941, f1096;
add.f32 f1384, f977, f986;
sub.f32 f1098, f977, f986;
mul.f32 f1099, f1098, 0fBF5DB3D7;
add.f32 f1100, f1099, f1097;
sub.f32 f1101, f1097, f1099;
add.f32 f1383, f968, f1384;
mul.f32 f1102, f1384, 0f3F000000;
sub.f32 f1103, f968, f1102;
sub.f32 f1104, f950, f959;
mul.f32 f1105, f1104, 0fBF5DB3D7;
sub.f32 f1106, f1103, f1105;
add.f32 f1107, f1105, f1103;
add.f32 f1108, f953, f962;
add.f32 f1109, f944, f1108;
mul.f32 f1112, f1108, 0f3F000000;
sub.f32 f1113, f944, f1112;
add.f32 f1382, f980, f989;
sub.f32 f1114, f980, f989;
mul.f32 f1115, f1114, 0fBF5DB3D7;
add.f32 f1116, f1115, f1113;
sub.f32 f1117, f1113, f1115;
add.f32 f1381, f971, f1382;
mul.f32 f1118, f1382, 0f3F000000;
sub.f32 f1119, f971, f1118;
sub.f32 f1120, f953, f962;
mul.f32 f1121, f1120, 0fBF5DB3D7;
sub.f32 f1122, f1119, f1121;
add.f32 f1123, f1121, f1119;
add.f32 f1124, f956, f965;
add.f32 f1125, f947, f1124;
mul.f32 f1128, f1124, 0f3F000000;
sub.f32 f1129, f947, f1128;
add.f32 f1380, f983, f992;
sub.f32 f1130, f983, f992;
mul.f32 f1131, f1130, 0fBF5DB3D7;
add.f32 f1132, f1131, f1129;
sub.f32 f1133, f1129, f1131;
add.f32 f1379, f974, f1380;
mul.f32 f1134, f1380, 0f3F000000;
sub.f32 f1135, f974, f1134;
sub.f32 f1136, f956, f965;
mul.f32 f1137, f1136, 0fBF5DB3D7;
sub.f32 f1138, f1135, f1137;
add.f32 f1139, f1137, f1135;
mul.f32 f1141, f1122, 0f3F248DBB;
mul.f32 f1378, f1116, 0f3F441B7D;
sub.f32 f1142, f1378, f1141;
mul.f32 f1143, f1122, 0f3F441B7D;
fma.rn.f32 f1144, f1116, 0f3F248DBB, f1143;
mul.f32 f1146, f1138, 0f3F7C1C5C;
mul.f32 f1377, f1132, 0f3E31D0D4;
sub.f32 f1147, f1377, f1146;
mul.f32 f1148, f1138, 0f3E31D0D4;
fma.rn.f32 f1149, f1132, 0f3F7C1C5C, f1148;
mul.f32 f1151, f1123, 0f3F7C1C5C;
mul.f32 f1376, f1117, 0f3E31D0D4;
sub.f32 f1152, f1376, f1151;
mul.f32 f1153, f1123, 0f3E31D0D4;
fma.rn.f32 f1154, f1117, 0f3F7C1C5C, f1153;
mul.f32 f1374, f1133, 0fBF708FB2;
mul.f32 f1375, f1139, 0f3EAF1D44;
sub.f32 f1157, f1374, f1375;
mul.f32 f1158, f1139, 0fBF708FB2;
fma.rn.f32 f1159, f1133, 0f3EAF1D44, f1158;
add.f32 f1160, f1109, f1125;
mul.f32 f1162, f1160, 0f3F000000;
sub.f32 f1163, f1093, f1162;
add.f32 f1373, f1381, f1379;
sub.f32 f1164, f1381, f1379;
mul.f32 f1165, f1164, 0fBF5DB3D7;
mul.f32 f1166, f1373, 0f3F000000;
sub.f32 f1167, f1383, f1166;
sub.f32 f1168, f1109, f1125;
mul.f32 f1169, f1168, 0fBF5DB3D7;
add.f32 f1170, f1142, f1147;
mul.f32 f1172, f1170, 0f3F000000;
sub.f32 f1173, f1100, f1172;
add.f32 f1372, f1144, f1149;
sub.f32 f1174, f1144, f1149;
mul.f32 f1175, f1174, 0fBF5DB3D7;
mul.f32 f1176, f1372, 0f3F000000;
sub.f32 f1177, f1106, f1176;
sub.f32 f1178, f1142, f1147;
mul.f32 f1179, f1178, 0fBF5DB3D7;
add.f32 f1180, f1152, f1157;
mul.f32 f1182, f1180, 0f3F000000;
sub.f32 f1183, f1101, f1182;
add.f32 f1371, f1154, f1159;
sub.f32 f1184, f1154, f1159;
mul.f32 f1185, f1184, 0fBF5DB3D7;
mul.f32 f1186, f1371, 0f3F000000;
sub.f32 f1187, f1107, f1186;
sub.f32 f1188, f1152, f1157;
mul.f32 f1189, f1188, 0fBF5DB3D7;
add.f32 f1190, f951, f960;
add.f32 f1191, f942, f1190;
mul.f32 f1194, f1190, 0f3F000000;
sub.f32 f1195, f942, f1194;
add.f32 f1370, f978, f987;
sub.f32 f1196, f978, f987;
mul.f32 f1197, f1196, 0fBF5DB3D7;
add.f32 f1198, f1197, f1195;
sub.f32 f1199, f1195, f1197;
add.f32 f1369, f969, f1370;
mul.f32 f1200, f1370, 0f3F000000;
sub.f32 f1201, f969, f1200;
sub.f32 f1202, f951, f960;
mul.f32 f1203, f1202, 0fBF5DB3D7;
sub.f32 f1204, f1201, f1203;
add.f32 f1205, f1203, f1201;
add.f32 f1206, f954, f963;
add.f32 f1207, f945, f1206;
mul.f32 f1210, f1206, 0f3F000000;
sub.f32 f1211, f945, f1210;
add.f32 f1368, f981, f990;
sub.f32 f1212, f981, f990;
mul.f32 f1213, f1212, 0fBF5DB3D7;
add.f32 f1214, f1213, f1211;
sub.f32 f1215, f1211, f1213;
add.f32 f1367, f972, f1368;
mul.f32 f1216, f1368, 0f3F000000;
sub.f32 f1217, f972, f1216;
sub.f32 f1218, f954, f963;
mul.f32 f1219, f1218, 0fBF5DB3D7;
sub.f32 f1220, f1217, f1219;
add.f32 f1221, f1219, f1217;
add.f32 f1222, f957, f966;
add.f32 f1223, f948, f1222;
mul.f32 f1226, f1222, 0f3F000000;
sub.f32 f1227, f948, f1226;
add.f32 f1366, f984, f993;
sub.f32 f1228, f984, f993;
mul.f32 f1229, f1228, 0fBF5DB3D7;
add.f32 f1230, f1229, f1227;
sub.f32 f1231, f1227, f1229;
add.f32 f1365, f975, f1366;
mul.f32 f1232, f1366, 0f3F000000;
sub.f32 f1233, f975, f1232;
sub.f32 f1234, f957, f966;
mul.f32 f1235, f1234, 0fBF5DB3D7;
sub.f32 f1236, f1233, f1235;
add.f32 f1237, f1235, f1233;
mul.f32 f1363, f1214, 0f3F441B7D;
mul.f32 f1364, f1220, 0f3F248DBB;
sub.f32 f1240, f1363, f1364;
mul.f32 f1241, f1220, 0f3F441B7D;
fma.rn.f32 f1242, f1214, 0f3F248DBB, f1241;
mul.f32 f1361, f1230, 0f3E31D0D4;
mul.f32 f1362, f1236, 0f3F7C1C5C;
sub.f32 f1245, f1361, f1362;
mul.f32 f1246, f1236, 0f3E31D0D4;
fma.rn.f32 f1247, f1230, 0f3F7C1C5C, f1246;
mul.f32 f1359, f1215, 0f3E31D0D4;
mul.f32 f1360, f1221, 0f3F7C1C5C;
sub.f32 f1250, f1359, f1360;
mul.f32 f1251, f1221, 0f3E31D0D4;
fma.rn.f32 f1252, f1215, 0f3F7C1C5C, f1251;
mul.f32 f1357, f1231, 0fBF708FB2;
mul.f32 f1358, f1237, 0f3EAF1D44;
sub.f32 f1255, f1357, f1358;
mul.f32 f1256, f1237, 0fBF708FB2;
fma.rn.f32 f1257, f1231, 0f3EAF1D44, f1256;
add.f32 f1258, f1207, f1223;
mul.f32 f1260, f1258, 0f3F000000;
sub.f32 f1261, f1191, f1260;
add.f32 f1356, f1367, f1365;
sub.f32 f1262, f1367, f1365;
mul.f32 f1263, f1262, 0fBF5DB3D7;
mul.f32 f1264, f1356, 0f3F000000;
sub.f32 f1265, f1369, f1264;
sub.f32 f1266, f1207, f1223;
mul.f32 f1267, f1266, 0fBF5DB3D7;
add.f32 f1268, f1240, f1245;
mul.f32 f1270, f1268, 0f3F000000;
sub.f32 f1271, f1198, f1270;
add.f32 f1355, f1242, f1247;
sub.f32 f1272, f1242, f1247;
mul.f32 f1273, f1272, 0fBF5DB3D7;
mul.f32 f1274, f1355, 0f3F000000;
sub.f32 f1275, f1204, f1274;
sub.f32 f1276, f1240, f1245;
mul.f32 f1277, f1276, 0fBF5DB3D7;
add.f32 f1278, f1250, f1255;
mul.f32 f1280, f1278, 0f3F000000;
sub.f32 f1281, f1199, f1280;
add.f32 f1354, f1252, f1257;
sub.f32 f1282, f1252, f1257;
mul.f32 f1283, f1282, 0fBF5DB3D7;
mul.f32 f1284, f1354, 0f3F000000;
sub.f32 f1285, f1205, f1284;
sub.f32 f1286, f1250, f1255;
mul.f32 f1548, f1356, 0f3F000000;
sub.f32 f1547, f1369, f1548;
mul.f32 f1287, f1286, 0fBF5DB3D7;
add.f32 %0, f995, f1062;
mul.f32 f1550, f1072, 0f3F000000;
sub.f32 f1549, f1002, f1550;
add.f32 %1, f1397, f1387;
mul.f32 f1552, f1385, 0f3F000000;
sub.f32 f1551, f1009, f1552;
mul.f32 f1554, f1386, 0f3F000000;
sub.f32 f1553, f1008, f1554;
add.f32 %2, f1093, f1160;
add.f32 %3, f1383, f1373;
add.f32 %4, f1191, f1258;
add.f32 %5, f1369, f1356;
add.f32 %7, f1008, f1386;
add.f32 %6, f1002, f1072;
add.f32 %9, f1106, f1372;
add.f32 %8, f1100, f1170;
add.f32 %11, f1204, f1355;
add.f32 %10, f1198, f1268;
add.f32 %13, f1009, f1385;
add.f32 %12, f1003, f1082;
add.f32 %15, f1107, f1371;
add.f32 %14, f1101, f1180;
add.f32 %17, f1205, f1354;
add.f32 %16, f1199, f1278;
sub.f32 %19, f1069, f1071;
add.f32 %18, f1067, f1065;
sub.f32 %21, f1167, f1169;
add.f32 %20, f1165, f1163;
add.f32 %22, f1263, f1261;
sub.f32 %23, f1547, f1267;
sub.f32 %25, f1553, f1081;
add.f32 %24, f1077, f1549;
sub.f32 %27, f1177, f1179;
add.f32 %26, f1175, f1173;
add.f32 %28, f1273, f1271;
sub.f32 %29, f1275, f1277;
add.f32 %30, f1087, f1085;
sub.f32 %31, f1551, f1091;
add.f32 %32, f1185, f1183;
sub.f32 %33, f1187, f1189;
add.f32 %34, f1283, f1281;
sub.f32 %35, f1285, f1287;
sub.f32 %36, f1065, f1067;
add.f32 %37, f1071, f1069;
sub.f32 %38, f1163, f1165;
add.f32 %39, f1169, f1167;
sub.f32 %40, f1261, f1263;
add.f32 %41, f1267, f1547;
add.f32 %43, f1081, f1553;
sub.f32 %42, f1549, f1077;
add.f32 %45, f1179, f1177;
sub.f32 %44, f1173, f1175;
add.f32 %47, f1277, f1275;
sub.f32 %46, f1271, f1273;
add.f32 %49, f1091, f1551;
sub.f32 %48, f1085, f1087;
add.f32 %51, f1189, f1187;
sub.f32 %50, f1183, f1185;
add.f32 %53, f1287, f1285;
sub.f32 %52, f1281, f1283;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), 
"f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y));
};




// Specialization <336, float, 1> of cufftdx_private_function: a single
// inline-PTX block that transforms the nine complex<float> values held in
// rmem[0..8] in place, exchanging data through shared memory twice.
// The include guard names this file FFT_243_FP32_INV, so this is presumably one
// variant of a 243-point inverse FFT with 9 elements per thread -- TODO confirm.
//
// Parameters:
//   rmem - per-thread array; rmem[0..8].x/.y are read as inputs and all 18
//          components are overwritten with the results.
//   smem - 32-bit value used as the base address for st.shared / ld.shared.
//
// The body contains four `barrier.sync 0` instructions, so it has to be reached
// by every thread that participates in barrier 0.
//
// Structure of the asm body, as written:
//   1. Add/sub butterflies over the input triples (0,3,6), (1,4,7), (2,5,8)
//      using the constants 0f3F000000 (0.5) and 0fBF5DB3D7 (about -0.8660254),
//      multiplications by fixed constant pairs, then a second butterfly layer.
//   2. A float2 p is loaded from lut_sp_9_243 at element index r11; p^2..p^8 are
//      built by repeated complex multiplication. Eight of the nine values v are
//      combined with one power each as
//        re = p.x * v.x + p.y * v.y,  im = p.x * v.y - p.y * v.x.
//   3. barrier.sync 0; nine st.shared.v2.f32 at r13 + {0, 8, ..., 64};
//      barrier.sync 0; nine ld.shared.v2.f32 at r15 + {0, 216, ..., 1728}.
//   4. The same butterfly / power pattern again, with the float2 loaded from
//      lut_sp_9_27 at element index r16.
//   5. barrier.sync 0; nine st.shared.v2.f32 at r21 + {0, 72, ..., 576};
//      barrier.sync 0; nine ld.shared.v2.f32 at r15 + {0, 216, ..., 1728}.
//   6. Three final 3-point butterflies whose results are written to %0..%17.
//
// Addressing (byte offsets):
//   r3  = %tid.y * 1944 + smem        r12 = r9 * 1944 + r3
//   r11 = %tid.x - 27 * r9            r13 = r11 * 72 + r12
//   r15 = r13 - (r11 << 6)            r18 = r11 - 9 * r16
//   r21 = r16 * 648 + r12 + (r18 << 3)
// r9 and r16 come from multiply-high/shift sequences that look like unsigned
// division of %tid.x by 27 and of r11 by 9 respectively -- TODO confirm.
template<> __forceinline__ __device__ void cufftdx_private_function<336, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<547>;
.reg .b32 r<22>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %18;
mad.lo.s32 r3, r1, 1944, r2;
add.f32 f37, %29, %37;
add.f32 f38, %21, f37;
add.f32 f39, %30, %38;
add.f32 f40, %22, f39;
mul.f32 f41, f37, 0f3F000000;
sub.f32 f42, %21, f41;
sub.f32 f43, %30, %38;
mul.f32 f44, f43, 0fBF5DB3D7;
add.f32 f45, f44, f42;
sub.f32 f46, f42, f44;
mul.f32 f47, f39, 0f3F000000;
sub.f32 f48, %22, f47;
sub.f32 f49, %29, %37;
mul.f32 f50, f49, 0fBF5DB3D7;
sub.f32 f51, f48, f50;
add.f32 f52, f50, f48;
add.f32 f53, %31, %39;
add.f32 f54, %23, f53;
add.f32 f55, %33, %41;
add.f32 f56, %25, f55;
mul.f32 f57, f53, 0f3F000000;
sub.f32 f58, %23, f57;
sub.f32 f59, %33, %41;
mul.f32 f60, f59, 0fBF5DB3D7;
add.f32 f61, f60, f58;
sub.f32 f62, f58, f60;
mul.f32 f63, f55, 0f3F000000;
sub.f32 f64, %25, f63;
sub.f32 f65, %31, %39;
mul.f32 f66, f65, 0fBF5DB3D7;
sub.f32 f67, f64, f66;
add.f32 f68, f66, f64;
add.f32 f69, %34, %42;
add.f32 f70, %26, f69;
add.f32 f71, %36, %43;
add.f32 f72, %28, f71;
mul.f32 f73, f69, 0f3F000000;
sub.f32 f74, %26, f73;
sub.f32 f75, %36, %43;
mul.f32 f76, f75, 0fBF5DB3D7;
add.f32 f77, f76, f74;
sub.f32 f78, f74, f76;
mul.f32 f79, f71, 0f3F000000;
sub.f32 f80, %28, f79;
sub.f32 f81, %34, %42;
mul.f32 f82, f81, 0fBF5DB3D7;
sub.f32 f83, f80, f82;
add.f32 f84, f82, f80;
mov.u32 r4, %tid.x;
mul.f32 f85, f61, 0f3F441B7D;
mul.f32 f86, f67, 0f3F248DBB;
sub.f32 f87, f85, f86;
mul.f32 f88, f67, 0f3F441B7D;
fma.rn.f32 f89, f61, 0f3F248DBB, f88;
mul.f32 f90, f77, 0f3E31D0D4;
mul.f32 f91, f83, 0f3F7C1C5C;
sub.f32 f92, f90, f91;
mul.f32 f93, f83, 0f3E31D0D4;
fma.rn.f32 f94, f77, 0f3F7C1C5C, f93;
mul.f32 f95, f62, 0f3E31D0D4;
mul.f32 f96, f68, 0f3F7C1C5C;
sub.f32 f97, f95, f96;
mul.f32 f98, f68, 0f3E31D0D4;
fma.rn.f32 f99, f62, 0f3F7C1C5C, f98;
mul.f32 f100, f78, 0fBF708FB2;
mul.f32 f101, f84, 0f3EAF1D44;
sub.f32 f102, f100, f101;
mul.f32 f103, f84, 0fBF708FB2;
fma.rn.f32 f104, f78, 0f3EAF1D44, f103;
add.f32 f105, f54, f70;
add.f32 f106, f56, f72;
mul.f32 f107, f105, 0f3F000000;
sub.f32 f108, f38, f107;
sub.f32 f109, f56, f72;
mul.f32 f110, f109, 0fBF5DB3D7;
add.f32 f111, f110, f108;
sub.f32 f112, f108, f110;
mul.f32 f113, f106, 0f3F000000;
sub.f32 f114, f40, f113;
sub.f32 f115, f54, f70;
mul.f32 f116, f115, 0fBF5DB3D7;
sub.f32 f117, f114, f116;
add.f32 f118, f116, f114;
add.f32 f119, f87, f92;
add.f32 f120, f45, f119;
add.f32 f121, f89, f94;
add.f32 f122, f51, f121;
mul.f32 f123, f119, 0f3F000000;
sub.f32 f124, f45, f123;
sub.f32 f125, f89, f94;
mul.f32 f126, f125, 0fBF5DB3D7;
add.f32 f127, f126, f124;
sub.f32 f128, f124, f126;
mul.f32 f129, f121, 0f3F000000;
sub.f32 f130, f51, f129;
sub.f32 f131, f87, f92;
mul.f32 f132, f131, 0fBF5DB3D7;
sub.f32 f133, f130, f132;
add.f32 f134, f132, f130;
add.f32 f135, f97, f102;
add.f32 f136, f46, f135;
add.f32 f137, f99, f104;
add.f32 f138, f52, f137;
mul.f32 f139, f135, 0f3F000000;
sub.f32 f140, f46, f139;
sub.f32 f141, f99, f104;
mul.f32 f142, f141, 0fBF5DB3D7;
add.f32 f143, f142, f140;
sub.f32 f144, f140, f142;
mul.f32 f145, f137, 0f3F000000;
sub.f32 f146, f52, f145;
sub.f32 f147, f97, f102;
mul.f32 f148, f147, 0fBF5DB3D7;
sub.f32 f149, f146, f148;
add.f32 f150, f148, f146;
mul.wide.u32 rd2, r4, 795364315;
shr.u64 rd3, rd2, 32;
cvt.u32.u64 r5, rd3;
sub.s32 r6, r4, r5;
shr.u32 r7, r6, 1;
add.s32 r8, r7, r5;
shr.u32 r9, r8, 4;
mul.lo.s32 r10, r9, 27;
sub.s32 r11, r4, r10;
mad.lo.s32 r12, r9, 1944, r3;
mul.wide.u32 rd4, r11, 8;
mov.u64 rd5, %19;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f151, f152}, [rd6];
mul.f32 f155, f122, f152;
mul.f32 f156, f120, f152;
mul.f32 f157, f151, f122;
mul.f32 f158, f151, f151;
mul.f32 f159, f152, f152;
sub.f32 f160, f158, f159;
mul.f32 f161, f152, f151;
fma.rn.f32 f162, f152, f151, f161;
mul.f32 f163, f138, f162;
mul.f32 f164, f136, f162;
mul.f32 f165, f160, f138;
mul.f32 f166, f151, f160;
mul.f32 f167, f152, f162;
sub.f32 f168, f166, f167;
mul.f32 f169, f151, f162;
fma.rn.f32 f170, f152, f160, f169;
mul.f32 f171, f117, f170;
mul.f32 f172, f111, f170;
mul.f32 f173, f168, f117;
mul.f32 f174, f151, f168;
mul.f32 f175, f152, f170;
sub.f32 f176, f174, f175;
mul.f32 f177, f151, f170;
fma.rn.f32 f178, f152, f168, f177;
mul.f32 f179, f133, f178;
mul.f32 f180, f127, f178;
mul.f32 f181, f176, f133;
mul.f32 f182, f151, f176;
mul.f32 f183, f152, f178;
sub.f32 f184, f182, f183;
mul.f32 f185, f151, f178;
fma.rn.f32 f186, f152, f176, f185;
mul.f32 f187, f149, f186;
mul.f32 f188, f143, f186;
mul.f32 f189, f184, f149;
mul.f32 f190, f151, f184;
mul.f32 f191, f152, f186;
sub.f32 f192, f190, f191;
mul.f32 f193, f151, f186;
fma.rn.f32 f194, f152, f184, f193;
mul.f32 f195, f118, f194;
mul.f32 f196, f112, f194;
mul.f32 f197, f192, f118;
mul.f32 f198, f151, f192;
mul.f32 f199, f152, f194;
sub.f32 f200, f198, f199;
mul.f32 f201, f151, f194;
fma.rn.f32 f202, f152, f192, f201;
mul.f32 f203, f134, f202;
mul.f32 f204, f128, f202;
mul.f32 f205, f200, f134;
mul.f32 f206, f151, f200;
mul.f32 f207, f152, f202;
sub.f32 f208, f206, f207;
mul.f32 f209, f151, f202;
fma.rn.f32 f210, f152, f200, f209;
mul.f32 f211, f150, f210;
mul.f32 f212, f144, f210;
mul.f32 f213, f208, f150;
barrier.sync 0;
mad.lo.s32 r13, r11, 72, r12;
add.f32 f214, f40, f106;
add.f32 f215, f38, f105;
st.shared.v2.f32 [r13], {f215, f214};
fma.rn.f32 f216, f151, f120, f155;
sub.f32 f217, f157, f156;
st.shared.v2.f32 [r13+8], {f216, f217};
fma.rn.f32 f218, f160, f136, f163;
sub.f32 f219, f165, f164;
st.shared.v2.f32 [r13+16], {f218, f219};
sub.f32 f220, f173, f172;
fma.rn.f32 f221, f168, f111, f171;
st.shared.v2.f32 [r13+24], {f221, f220};
fma.rn.f32 f222, f176, f127, f179;
sub.f32 f223, f181, f180;
st.shared.v2.f32 [r13+32], {f222, f223};
sub.f32 f224, f189, f188;
fma.rn.f32 f225, f184, f143, f187;
st.shared.v2.f32 [r13+40], {f225, f224};
fma.rn.f32 f226, f192, f112, f195;
sub.f32 f227, f197, f196;
st.shared.v2.f32 [r13+48], {f226, f227};
fma.rn.f32 f228, f200, f128, f203;
sub.f32 f229, f205, f204;
st.shared.v2.f32 [r13+56], {f228, f229};
fma.rn.f32 f230, f208, f144, f211;
sub.f32 f231, f213, f212;
st.shared.v2.f32 [r13+64], {f230, f231};
barrier.sync 0;
shl.b32 r14, r11, 6;
sub.s32 r15, r13, r14;
ld.shared.v2.f32 {f232, f233}, [r15];
ld.shared.v2.f32 {f236, f237}, [r15+216];
ld.shared.v2.f32 {f240, f241}, [r15+432];
ld.shared.v2.f32 {f244, f245}, [r15+648];
ld.shared.v2.f32 {f248, f249}, [r15+864];
ld.shared.v2.f32 {f252, f253}, [r15+1080];
ld.shared.v2.f32 {f256, f257}, [r15+1296];
ld.shared.v2.f32 {f260, f261}, [r15+1512];
ld.shared.v2.f32 {f264, f265}, [r15+1728];
add.f32 f268, f244, f256;
add.f32 f269, f232, f268;
add.f32 f270, f245, f257;
add.f32 f271, f233, f270;
mul.f32 f272, f268, 0f3F000000;
sub.f32 f273, f232, f272;
sub.f32 f274, f245, f257;
mul.f32 f275, f274, 0fBF5DB3D7;
add.f32 f276, f275, f273;
sub.f32 f277, f273, f275;
mul.f32 f278, f270, 0f3F000000;
sub.f32 f279, f233, f278;
sub.f32 f280, f244, f256;
mul.f32 f281, f280, 0fBF5DB3D7;
sub.f32 f282, f279, f281;
add.f32 f283, f281, f279;
add.f32 f284, f248, f260;
add.f32 f285, f236, f284;
add.f32 f286, f249, f261;
add.f32 f287, f237, f286;
mul.f32 f288, f284, 0f3F000000;
sub.f32 f289, f236, f288;
sub.f32 f290, f249, f261;
mul.f32 f291, f290, 0fBF5DB3D7;
add.f32 f292, f291, f289;
sub.f32 f293, f289, f291;
mul.f32 f294, f286, 0f3F000000;
sub.f32 f295, f237, f294;
sub.f32 f296, f248, f260;
mul.f32 f297, f296, 0fBF5DB3D7;
sub.f32 f298, f295, f297;
add.f32 f299, f297, f295;
add.f32 f300, f252, f264;
add.f32 f301, f240, f300;
add.f32 f302, f253, f265;
add.f32 f303, f241, f302;
mul.f32 f304, f300, 0f3F000000;
sub.f32 f305, f240, f304;
sub.f32 f306, f253, f265;
mul.f32 f307, f306, 0fBF5DB3D7;
add.f32 f308, f307, f305;
sub.f32 f309, f305, f307;
mul.f32 f310, f302, 0f3F000000;
sub.f32 f311, f241, f310;
sub.f32 f312, f252, f264;
mul.f32 f313, f312, 0fBF5DB3D7;
sub.f32 f314, f311, f313;
add.f32 f315, f313, f311;
mul.f32 f316, f292, 0f3F441B7D;
mul.f32 f317, f298, 0f3F248DBB;
sub.f32 f318, f316, f317;
mul.f32 f319, f298, 0f3F441B7D;
fma.rn.f32 f320, f292, 0f3F248DBB, f319;
mul.f32 f321, f308, 0f3E31D0D4;
mul.f32 f322, f314, 0f3F7C1C5C;
sub.f32 f323, f321, f322;
mul.f32 f324, f314, 0f3E31D0D4;
fma.rn.f32 f325, f308, 0f3F7C1C5C, f324;
mul.f32 f326, f293, 0f3E31D0D4;
mul.f32 f327, f299, 0f3F7C1C5C;
sub.f32 f328, f326, f327;
mul.f32 f329, f299, 0f3E31D0D4;
fma.rn.f32 f330, f293, 0f3F7C1C5C, f329;
mul.f32 f331, f309, 0fBF708FB2;
mul.f32 f332, f315, 0f3EAF1D44;
sub.f32 f333, f331, f332;
mul.f32 f334, f315, 0fBF708FB2;
fma.rn.f32 f335, f309, 0f3EAF1D44, f334;
add.f32 f336, f285, f301;
add.f32 f337, f287, f303;
mul.f32 f338, f336, 0f3F000000;
sub.f32 f339, f269, f338;
sub.f32 f340, f287, f303;
mul.f32 f341, f340, 0fBF5DB3D7;
add.f32 f342, f341, f339;
sub.f32 f343, f339, f341;
mul.f32 f344, f337, 0f3F000000;
sub.f32 f345, f271, f344;
sub.f32 f346, f285, f301;
mul.f32 f347, f346, 0fBF5DB3D7;
sub.f32 f348, f345, f347;
add.f32 f349, f347, f345;
add.f32 f350, f318, f323;
add.f32 f351, f276, f350;
add.f32 f352, f320, f325;
add.f32 f353, f282, f352;
mul.f32 f354, f350, 0f3F000000;
sub.f32 f355, f276, f354;
sub.f32 f356, f320, f325;
mul.f32 f357, f356, 0fBF5DB3D7;
add.f32 f358, f357, f355;
sub.f32 f359, f355, f357;
mul.f32 f360, f352, 0f3F000000;
sub.f32 f361, f282, f360;
sub.f32 f362, f318, f323;
mul.f32 f363, f362, 0fBF5DB3D7;
sub.f32 f364, f361, f363;
add.f32 f365, f363, f361;
add.f32 f366, f328, f333;
add.f32 f367, f277, f366;
add.f32 f368, f330, f335;
add.f32 f369, f283, f368;
mul.f32 f370, f366, 0f3F000000;
sub.f32 f371, f277, f370;
sub.f32 f372, f330, f335;
mul.f32 f373, f372, 0fBF5DB3D7;
add.f32 f374, f373, f371;
sub.f32 f375, f371, f373;
mul.f32 f376, f368, 0f3F000000;
sub.f32 f377, f283, f376;
sub.f32 f378, f328, f333;
mul.f32 f379, f378, 0fBF5DB3D7;
sub.f32 f380, f377, f379;
add.f32 f381, f379, f377;
mul.wide.u32 rd7, r11, 954437177;
shr.u64 rd8, rd7, 33;
cvt.u32.u64 r16, rd8;
mul.lo.s32 r17, r16, 9;
sub.s32 r18, r11, r17;
mul.wide.u32 rd9, r16, 8;
mov.u64 rd10, %20;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f382, f383}, [rd11];
mul.f32 f386, f353, f383;
mul.f32 f387, f351, f383;
mul.f32 f388, f382, f353;
mul.f32 f389, f382, f382;
mul.f32 f390, f383, f383;
sub.f32 f391, f389, f390;
mul.f32 f392, f383, f382;
fma.rn.f32 f393, f383, f382, f392;
mul.f32 f394, f369, f393;
mul.f32 f395, f367, f393;
mul.f32 f396, f391, f369;
mul.f32 f397, f382, f391;
mul.f32 f398, f383, f393;
sub.f32 f399, f397, f398;
mul.f32 f400, f382, f393;
fma.rn.f32 f401, f383, f391, f400;
mul.f32 f402, f348, f401;
mul.f32 f403, f342, f401;
mul.f32 f404, f399, f348;
mul.f32 f405, f382, f399;
mul.f32 f406, f383, f401;
sub.f32 f407, f405, f406;
mul.f32 f408, f382, f401;
fma.rn.f32 f409, f383, f399, f408;
mul.f32 f410, f364, f409;
mul.f32 f411, f358, f409;
mul.f32 f412, f407, f364;
mul.f32 f413, f382, f407;
mul.f32 f414, f383, f409;
sub.f32 f415, f413, f414;
mul.f32 f416, f382, f409;
fma.rn.f32 f417, f383, f407, f416;
mul.f32 f418, f380, f417;
mul.f32 f419, f374, f417;
mul.f32 f420, f415, f380;
mul.f32 f421, f382, f415;
mul.f32 f422, f383, f417;
sub.f32 f423, f421, f422;
mul.f32 f424, f382, f417;
fma.rn.f32 f425, f383, f415, f424;
mul.f32 f426, f349, f425;
mul.f32 f427, f343, f425;
mul.f32 f428, f423, f349;
mul.f32 f429, f382, f423;
mul.f32 f430, f383, f425;
sub.f32 f431, f429, f430;
mul.f32 f432, f382, f425;
fma.rn.f32 f433, f383, f423, f432;
mul.f32 f434, f365, f433;
mul.f32 f435, f359, f433;
mul.f32 f436, f431, f365;
mul.f32 f437, f382, f431;
mul.f32 f438, f383, f433;
sub.f32 f439, f437, f438;
mul.f32 f440, f382, f433;
fma.rn.f32 f441, f383, f431, f440;
mul.f32 f442, f381, f441;
mul.f32 f443, f375, f441;
mul.f32 f444, f439, f381;
shl.b32 r19, r18, 3;
add.s32 r20, r12, r19;
barrier.sync 0;
mad.lo.s32 r21, r16, 648, r20;
add.f32 f445, f271, f337;
add.f32 f446, f269, f336;
st.shared.v2.f32 [r21], {f446, f445};
fma.rn.f32 f447, f382, f351, f386;
sub.f32 f448, f388, f387;
st.shared.v2.f32 [r21+72], {f447, f448};
fma.rn.f32 f449, f391, f367, f394;
sub.f32 f450, f396, f395;
st.shared.v2.f32 [r21+144], {f449, f450};
fma.rn.f32 f451, f399, f342, f402;
sub.f32 f452, f404, f403;
st.shared.v2.f32 [r21+216], {f451, f452};
fma.rn.f32 f453, f407, f358, f410;
sub.f32 f454, f412, f411;
st.shared.v2.f32 [r21+288], {f453, f454};
fma.rn.f32 f455, f415, f374, f418;
sub.f32 f456, f420, f419;
st.shared.v2.f32 [r21+360], {f455, f456};
fma.rn.f32 f457, f423, f343, f426;
sub.f32 f458, f428, f427;
st.shared.v2.f32 [r21+432], {f457, f458};
sub.f32 f459, f436, f435;
fma.rn.f32 f460, f431, f359, f434;
st.shared.v2.f32 [r21+504], {f460, f459};
fma.rn.f32 f461, f439, f375, f442;
sub.f32 f462, f444, f443;
st.shared.v2.f32 [r21+576], {f461, f462};
barrier.sync 0;
ld.shared.v2.f32 {f463, f464}, [r15];
ld.shared.v2.f32 {f467, f468}, [r15+216];
ld.shared.v2.f32 {f471, f472}, [r15+432];
ld.shared.v2.f32 {f475, f476}, [r15+648];
ld.shared.v2.f32 {f479, f480}, [r15+864];
ld.shared.v2.f32 {f483, f484}, [r15+1080];
ld.shared.v2.f32 {f487, f488}, [r15+1296];
ld.shared.v2.f32 {f491, f492}, [r15+1512];
ld.shared.v2.f32 {f495, f496}, [r15+1728];
add.f32 f499, f475, f487;
add.f32 f500, f476, f488;
mul.f32 f501, f499, 0f3F000000;
sub.f32 f502, f463, f501;
sub.f32 f503, f476, f488;
mul.f32 f504, f503, 0fBF5DB3D7;
mul.f32 f505, f500, 0f3F000000;
sub.f32 f506, f464, f505;
sub.f32 f507, f475, f487;
mul.f32 f508, f507, 0fBF5DB3D7;
add.f32 f509, f479, f491;
add.f32 f510, f480, f492;
mul.f32 f511, f509, 0f3F000000;
sub.f32 f512, f467, f511;
sub.f32 f513, f480, f492;
mul.f32 f514, f513, 0fBF5DB3D7;
mul.f32 f515, f510, 0f3F000000;
sub.f32 f516, f468, f515;
sub.f32 f517, f479, f491;
mul.f32 f518, f517, 0fBF5DB3D7;
add.f32 f519, f483, f495;
add.f32 f520, f484, f496;
mul.f32 f521, f519, 0f3F000000;
sub.f32 f522, f471, f521;
sub.f32 f523, f484, f496;
mul.f32 f524, f523, 0fBF5DB3D7;
mul.f32 f525, f520, 0f3F000000;
sub.f32 f526, f472, f525;
sub.f32 f527, f483, f495;
mul.f32 f528, f527, 0fBF5DB3D7;
add.f32 %1, f464, f500;
add.f32 %0, f463, f499;
add.f32 %3, f468, f510;
add.f32 %2, f467, f509;
add.f32 %5, f472, f520;
add.f32 %4, f471, f519;
sub.f32 %7, f506, f508;
add.f32 %6, f504, f502;
sub.f32 %9, f516, f518;
add.f32 %8, f514, f512;
sub.f32 %11, f526, f528;
add.f32 %10, f524, f522;
add.f32 %13, f508, f506;
sub.f32 %12, f502, f504;
add.f32 %15, f518, f516;
sub.f32 %14, f512, f514;
add.f32 %17, f528, f526;
sub.f32 %16, f522, f524;
})"
     // Operand map: %0..%17 are the outputs rmem[0..8].{x,y}; %18 = smem,
     // %19 = lut_sp_9_243, %20 = lut_sp_9_27; %21..%43 are the float inputs.
     // rmem[k].y is listed twice for k = 1, 2, 4, 5, 7: the asm text reads the
     // second copy (%25, %28, %33, %36, %41) and never %24, %27, %32, %35, %40,
     // so removing the duplicates would shift every later placeholder index.
     // NOTE(review): no clobber list is given although the body accesses shared
     // and global memory -- confirm that this is intended.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y));
};




// cufftdx_private_function<337, float, 1>
//
// One inline-PTX statement that transforms the nine complex<float> values in
// rmem[0..8] in place, exchanging intermediate values between threads through
// shared memory. The include guard of this header names a 243-point FP32
// inverse FFT; presumably this is one per-thread-9-element variant of it --
// TODO confirm against the code that selects function id 337.
//
// Parameters:
//   rmem  rmem[0..8].x/.y are read as inputs and overwritten with the results.
//   smem  32-bit shared-memory address used as the base of the exchange buffer.
//
// Also reads two float2 tables declared elsewhere, lut_sp_9_243 and
// lut_sp_9_27, with ld.global.v2.f32.
//
// Outline of the PTX body (all statements are inside the raw string):
//   * r3  = smem + tid.y * 972
//     r9  = tid.x / 27 (multiply-high + shift sequence), r11 = tid.x - 27 * r9
//     r12 = r3 + r9 * 972   (972 bytes = 243 floats)
//   * Stage 1: three-input butterflies on inputs (0,3,6), (1,4,7), (2,5,8),
//     fixed rotations, then a second round of three-input butterflies.
//     0f3F000000 is 0.5 and 0fBF5DB3D7 is about -0.8660254; the remaining hex
//     constants decode to about 0.766044/0.642788, 0.173648/0.984808 and
//     -0.939693/0.342020.
//     Eight of the nine results are then combined with successive powers of
//     the float2 loaded from lut_sp_9_243[r11]; the powers are built by
//     repeated multiplication in registers.
//   * Exchange 1: each thread stores nine consecutive floats at
//     r13 = r12 + r11 * 36 and, after a barrier, loads nine floats from
//     r15 = r13 - r11 * 32 at a 108-byte stride. This is done once for the .x
//     results and once more, reusing the same buffer, for the .y results.
//   * Stage 2: same butterfly structure on the exchanged values, with
//     r16 = r11 / 9, r18 = r11 - 9 * r16 and the float2 from lut_sp_9_27[r16].
//   * Exchange 2: stores at r21 = r12 + r18 * 4 + r16 * 324 with a 36-byte
//     stride, loads again from r15 with a 108-byte stride (.x pass, .y pass).
//   * Stage 3: a final round of three-input butterflies writes %0..%17.
//
// The body executes `barrier.sync 0` eight times, so every thread that takes
// part in barrier 0 has to execute this function; do not call it from
// divergent control flow.
//
// NOTE(review): the asm statement has no "memory" clobber even though it
// stores to and loads from shared memory. It relies on `volatile` plus its own
// barriers; confirm that callers do not depend on the compiler ordering
// surrounding shared-memory accesses relative to this statement.
template<> __forceinline__ __device__ void cufftdx_private_function<337, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Scratch registers (f*, r*, rd*) are declared inside the braces of the PTX
// string, so they are local to this asm block.
asm volatile (R"({
.reg .f32 f<511>;
.reg .b32 r<22>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %18;
mad.lo.s32 r3, r1, 972, r2;
add.f32 f37, %29, %37;
add.f32 f38, %21, f37;
add.f32 f39, %30, %38;
add.f32 f40, %22, f39;
mul.f32 f41, f37, 0f3F000000;
sub.f32 f42, %21, f41;
sub.f32 f43, %30, %38;
mul.f32 f44, f43, 0fBF5DB3D7;
add.f32 f45, f44, f42;
sub.f32 f46, f42, f44;
mul.f32 f47, f39, 0f3F000000;
sub.f32 f48, %22, f47;
sub.f32 f49, %29, %37;
mul.f32 f50, f49, 0fBF5DB3D7;
sub.f32 f51, f48, f50;
add.f32 f52, f50, f48;
add.f32 f53, %31, %39;
add.f32 f54, %23, f53;
add.f32 f55, %33, %41;
add.f32 f56, %25, f55;
mul.f32 f57, f53, 0f3F000000;
sub.f32 f58, %23, f57;
sub.f32 f59, %33, %41;
mul.f32 f60, f59, 0fBF5DB3D7;
add.f32 f61, f60, f58;
sub.f32 f62, f58, f60;
mul.f32 f63, f55, 0f3F000000;
sub.f32 f64, %25, f63;
sub.f32 f65, %31, %39;
mul.f32 f66, f65, 0fBF5DB3D7;
sub.f32 f67, f64, f66;
add.f32 f68, f66, f64;
add.f32 f69, %34, %42;
add.f32 f70, %26, f69;
add.f32 f71, %36, %43;
add.f32 f72, %28, f71;
mul.f32 f73, f69, 0f3F000000;
sub.f32 f74, %26, f73;
sub.f32 f75, %36, %43;
mul.f32 f76, f75, 0fBF5DB3D7;
add.f32 f77, f76, f74;
sub.f32 f78, f74, f76;
mul.f32 f79, f71, 0f3F000000;
sub.f32 f80, %28, f79;
sub.f32 f81, %34, %42;
mul.f32 f82, f81, 0fBF5DB3D7;
sub.f32 f83, f80, f82;
add.f32 f84, f82, f80;
mov.u32 r4, %tid.x;
mul.f32 f85, f61, 0f3F441B7D;
mul.f32 f86, f67, 0f3F248DBB;
sub.f32 f87, f85, f86;
mul.f32 f88, f67, 0f3F441B7D;
fma.rn.f32 f89, f61, 0f3F248DBB, f88;
mul.f32 f90, f77, 0f3E31D0D4;
mul.f32 f91, f83, 0f3F7C1C5C;
sub.f32 f92, f90, f91;
mul.f32 f93, f83, 0f3E31D0D4;
fma.rn.f32 f94, f77, 0f3F7C1C5C, f93;
mul.f32 f95, f62, 0f3E31D0D4;
mul.f32 f96, f68, 0f3F7C1C5C;
sub.f32 f97, f95, f96;
mul.f32 f98, f68, 0f3E31D0D4;
fma.rn.f32 f99, f62, 0f3F7C1C5C, f98;
mul.f32 f100, f78, 0fBF708FB2;
mul.f32 f101, f84, 0f3EAF1D44;
sub.f32 f102, f100, f101;
mul.f32 f103, f84, 0fBF708FB2;
fma.rn.f32 f104, f78, 0f3EAF1D44, f103;
add.f32 f105, f54, f70;
add.f32 f106, f38, f105;
add.f32 f107, f56, f72;
add.f32 f108, f40, f107;
mul.f32 f109, f105, 0f3F000000;
sub.f32 f110, f38, f109;
sub.f32 f111, f56, f72;
mul.f32 f112, f111, 0fBF5DB3D7;
add.f32 f113, f112, f110;
sub.f32 f114, f110, f112;
mul.f32 f115, f107, 0f3F000000;
sub.f32 f116, f40, f115;
sub.f32 f117, f54, f70;
mul.f32 f118, f117, 0fBF5DB3D7;
sub.f32 f119, f116, f118;
add.f32 f120, f118, f116;
add.f32 f121, f87, f92;
add.f32 f122, f45, f121;
add.f32 f123, f89, f94;
add.f32 f124, f51, f123;
mul.f32 f125, f121, 0f3F000000;
sub.f32 f126, f45, f125;
sub.f32 f127, f89, f94;
mul.f32 f128, f127, 0fBF5DB3D7;
add.f32 f129, f128, f126;
sub.f32 f130, f126, f128;
mul.f32 f131, f123, 0f3F000000;
sub.f32 f132, f51, f131;
sub.f32 f133, f87, f92;
mul.f32 f134, f133, 0fBF5DB3D7;
sub.f32 f135, f132, f134;
add.f32 f136, f134, f132;
add.f32 f137, f97, f102;
add.f32 f138, f46, f137;
add.f32 f139, f99, f104;
add.f32 f140, f52, f139;
mul.f32 f141, f137, 0f3F000000;
sub.f32 f142, f46, f141;
sub.f32 f143, f99, f104;
mul.f32 f144, f143, 0fBF5DB3D7;
add.f32 f145, f144, f142;
sub.f32 f146, f142, f144;
mul.f32 f147, f139, 0f3F000000;
sub.f32 f148, f52, f147;
sub.f32 f149, f97, f102;
mul.f32 f150, f149, 0fBF5DB3D7;
sub.f32 f151, f148, f150;
add.f32 f152, f150, f148;
mul.wide.u32 rd2, r4, 795364315;
shr.u64 rd3, rd2, 32;
cvt.u32.u64 r5, rd3;
sub.s32 r6, r4, r5;
shr.u32 r7, r6, 1;
add.s32 r8, r7, r5;
shr.u32 r9, r8, 4;
mul.lo.s32 r10, r9, 27;
sub.s32 r11, r4, r10;
mul.wide.u32 rd4, r11, 8;
mov.u64 rd5, %19;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f153, f154}, [rd6];
mul.f32 f157, f124, f154;
fma.rn.f32 f158, f153, f122, f157;
mul.f32 f159, f122, f154;
mul.f32 f160, f153, f124;
sub.f32 f161, f160, f159;
mul.f32 f162, f153, f153;
mul.f32 f163, f154, f154;
sub.f32 f164, f162, f163;
mul.f32 f165, f154, f153;
fma.rn.f32 f166, f154, f153, f165;
mul.f32 f167, f140, f166;
fma.rn.f32 f168, f164, f138, f167;
mul.f32 f169, f138, f166;
mul.f32 f170, f164, f140;
sub.f32 f171, f170, f169;
mul.f32 f172, f153, f164;
mul.f32 f173, f154, f166;
sub.f32 f174, f172, f173;
mul.f32 f175, f153, f166;
fma.rn.f32 f176, f154, f164, f175;
mul.f32 f177, f119, f176;
fma.rn.f32 f178, f174, f113, f177;
mul.f32 f179, f113, f176;
mul.f32 f180, f174, f119;
sub.f32 f181, f180, f179;
mul.f32 f182, f153, f174;
mul.f32 f183, f154, f176;
sub.f32 f184, f182, f183;
mul.f32 f185, f153, f176;
fma.rn.f32 f186, f154, f174, f185;
mul.f32 f187, f135, f186;
fma.rn.f32 f188, f184, f129, f187;
mul.f32 f189, f129, f186;
mul.f32 f190, f184, f135;
sub.f32 f191, f190, f189;
mul.f32 f192, f153, f184;
mul.f32 f193, f154, f186;
sub.f32 f194, f192, f193;
mul.f32 f195, f153, f186;
fma.rn.f32 f196, f154, f184, f195;
mul.f32 f197, f151, f196;
fma.rn.f32 f198, f194, f145, f197;
mul.f32 f199, f145, f196;
mul.f32 f200, f194, f151;
sub.f32 f201, f200, f199;
mul.f32 f202, f153, f194;
mul.f32 f203, f154, f196;
sub.f32 f204, f202, f203;
mul.f32 f205, f153, f196;
fma.rn.f32 f206, f154, f194, f205;
mul.f32 f207, f120, f206;
fma.rn.f32 f208, f204, f114, f207;
mul.f32 f209, f114, f206;
mul.f32 f210, f204, f120;
sub.f32 f211, f210, f209;
mul.f32 f212, f153, f204;
mul.f32 f213, f154, f206;
sub.f32 f214, f212, f213;
mul.f32 f215, f153, f206;
fma.rn.f32 f216, f154, f204, f215;
mul.f32 f217, f136, f216;
fma.rn.f32 f218, f214, f130, f217;
mul.f32 f219, f130, f216;
mul.f32 f220, f214, f136;
sub.f32 f221, f220, f219;
mul.f32 f222, f153, f214;
mul.f32 f223, f154, f216;
sub.f32 f224, f222, f223;
mul.f32 f225, f153, f216;
fma.rn.f32 f226, f154, f214, f225;
mul.f32 f227, f152, f226;
fma.rn.f32 f228, f224, f146, f227;
mul.f32 f229, f146, f226;
mul.f32 f230, f224, f152;
sub.f32 f231, f230, f229;
mad.lo.s32 r12, r9, 972, r3;
barrier.sync 0;
mad.lo.s32 r13, r11, 36, r12;
st.shared.f32 [r13], f106;
st.shared.f32 [r13+4], f158;
st.shared.f32 [r13+8], f168;
st.shared.f32 [r13+12], f178;
st.shared.f32 [r13+16], f188;
st.shared.f32 [r13+20], f198;
st.shared.f32 [r13+24], f208;
st.shared.f32 [r13+28], f218;
st.shared.f32 [r13+32], f228;
barrier.sync 0;
shl.b32 r14, r11, 5;
sub.s32 r15, r13, r14;
ld.shared.f32 f232, [r15];
ld.shared.f32 f233, [r15+108];
ld.shared.f32 f234, [r15+216];
ld.shared.f32 f235, [r15+324];
ld.shared.f32 f236, [r15+432];
ld.shared.f32 f237, [r15+540];
ld.shared.f32 f238, [r15+648];
ld.shared.f32 f239, [r15+756];
ld.shared.f32 f240, [r15+864];
barrier.sync 0;
st.shared.f32 [r13], f108;
st.shared.f32 [r13+4], f161;
st.shared.f32 [r13+8], f171;
st.shared.f32 [r13+12], f181;
st.shared.f32 [r13+16], f191;
st.shared.f32 [r13+20], f201;
st.shared.f32 [r13+24], f211;
st.shared.f32 [r13+28], f221;
st.shared.f32 [r13+32], f231;
barrier.sync 0;
ld.shared.f32 f241, [r15];
ld.shared.f32 f242, [r15+108];
ld.shared.f32 f243, [r15+216];
ld.shared.f32 f244, [r15+324];
ld.shared.f32 f245, [r15+432];
ld.shared.f32 f246, [r15+540];
ld.shared.f32 f247, [r15+648];
ld.shared.f32 f248, [r15+756];
ld.shared.f32 f249, [r15+864];
add.f32 f250, f235, f238;
add.f32 f251, f232, f250;
add.f32 f252, f244, f247;
add.f32 f253, f241, f252;
mul.f32 f254, f250, 0f3F000000;
sub.f32 f255, f232, f254;
sub.f32 f256, f244, f247;
mul.f32 f257, f256, 0fBF5DB3D7;
add.f32 f258, f257, f255;
sub.f32 f259, f255, f257;
mul.f32 f260, f252, 0f3F000000;
sub.f32 f261, f241, f260;
sub.f32 f262, f235, f238;
mul.f32 f263, f262, 0fBF5DB3D7;
sub.f32 f264, f261, f263;
add.f32 f265, f263, f261;
add.f32 f266, f236, f239;
add.f32 f267, f233, f266;
add.f32 f268, f245, f248;
add.f32 f269, f242, f268;
mul.f32 f270, f266, 0f3F000000;
sub.f32 f271, f233, f270;
sub.f32 f272, f245, f248;
mul.f32 f273, f272, 0fBF5DB3D7;
add.f32 f274, f273, f271;
sub.f32 f275, f271, f273;
mul.f32 f276, f268, 0f3F000000;
sub.f32 f277, f242, f276;
sub.f32 f278, f236, f239;
mul.f32 f279, f278, 0fBF5DB3D7;
sub.f32 f280, f277, f279;
add.f32 f281, f279, f277;
add.f32 f282, f237, f240;
add.f32 f283, f234, f282;
add.f32 f284, f246, f249;
add.f32 f285, f243, f284;
mul.f32 f286, f282, 0f3F000000;
sub.f32 f287, f234, f286;
sub.f32 f288, f246, f249;
mul.f32 f289, f288, 0fBF5DB3D7;
add.f32 f290, f289, f287;
sub.f32 f291, f287, f289;
mul.f32 f292, f284, 0f3F000000;
sub.f32 f293, f243, f292;
sub.f32 f294, f237, f240;
mul.f32 f295, f294, 0fBF5DB3D7;
sub.f32 f296, f293, f295;
add.f32 f297, f295, f293;
mul.f32 f298, f274, 0f3F441B7D;
mul.f32 f299, f280, 0f3F248DBB;
sub.f32 f300, f298, f299;
mul.f32 f301, f280, 0f3F441B7D;
fma.rn.f32 f302, f274, 0f3F248DBB, f301;
mul.f32 f303, f290, 0f3E31D0D4;
mul.f32 f304, f296, 0f3F7C1C5C;
sub.f32 f305, f303, f304;
mul.f32 f306, f296, 0f3E31D0D4;
fma.rn.f32 f307, f290, 0f3F7C1C5C, f306;
mul.f32 f308, f275, 0f3E31D0D4;
mul.f32 f309, f281, 0f3F7C1C5C;
sub.f32 f310, f308, f309;
mul.f32 f311, f281, 0f3E31D0D4;
fma.rn.f32 f312, f275, 0f3F7C1C5C, f311;
mul.f32 f313, f291, 0fBF708FB2;
mul.f32 f314, f297, 0f3EAF1D44;
sub.f32 f315, f313, f314;
mul.f32 f316, f297, 0fBF708FB2;
fma.rn.f32 f317, f291, 0f3EAF1D44, f316;
add.f32 f318, f267, f283;
add.f32 f319, f251, f318;
add.f32 f320, f269, f285;
add.f32 f321, f253, f320;
mul.f32 f322, f318, 0f3F000000;
sub.f32 f323, f251, f322;
sub.f32 f324, f269, f285;
mul.f32 f325, f324, 0fBF5DB3D7;
add.f32 f326, f325, f323;
sub.f32 f327, f323, f325;
mul.f32 f328, f320, 0f3F000000;
sub.f32 f329, f253, f328;
sub.f32 f330, f267, f283;
mul.f32 f331, f330, 0fBF5DB3D7;
sub.f32 f332, f329, f331;
add.f32 f333, f331, f329;
add.f32 f334, f300, f305;
add.f32 f335, f258, f334;
add.f32 f336, f302, f307;
add.f32 f337, f264, f336;
mul.f32 f338, f334, 0f3F000000;
sub.f32 f339, f258, f338;
sub.f32 f340, f302, f307;
mul.f32 f341, f340, 0fBF5DB3D7;
add.f32 f342, f341, f339;
sub.f32 f343, f339, f341;
mul.f32 f344, f336, 0f3F000000;
sub.f32 f345, f264, f344;
sub.f32 f346, f300, f305;
mul.f32 f347, f346, 0fBF5DB3D7;
sub.f32 f348, f345, f347;
add.f32 f349, f347, f345;
add.f32 f350, f310, f315;
add.f32 f351, f259, f350;
add.f32 f352, f312, f317;
add.f32 f353, f265, f352;
mul.f32 f354, f350, 0f3F000000;
sub.f32 f355, f259, f354;
sub.f32 f356, f312, f317;
mul.f32 f357, f356, 0fBF5DB3D7;
add.f32 f358, f357, f355;
sub.f32 f359, f355, f357;
mul.f32 f360, f352, 0f3F000000;
sub.f32 f361, f265, f360;
sub.f32 f362, f310, f315;
mul.f32 f363, f362, 0fBF5DB3D7;
sub.f32 f364, f361, f363;
add.f32 f365, f363, f361;
mul.wide.u32 rd7, r11, 954437177;
shr.u64 rd8, rd7, 33;
cvt.u32.u64 r16, rd8;
mul.lo.s32 r17, r16, 9;
sub.s32 r18, r11, r17;
mul.wide.u32 rd9, r16, 8;
mov.u64 rd10, %20;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f366, f367}, [rd11];
mul.f32 f370, f337, f367;
fma.rn.f32 f371, f366, f335, f370;
mul.f32 f372, f335, f367;
mul.f32 f373, f366, f337;
sub.f32 f374, f373, f372;
mul.f32 f375, f366, f366;
mul.f32 f376, f367, f367;
sub.f32 f377, f375, f376;
mul.f32 f378, f367, f366;
fma.rn.f32 f379, f367, f366, f378;
mul.f32 f380, f353, f379;
fma.rn.f32 f381, f377, f351, f380;
mul.f32 f382, f351, f379;
mul.f32 f383, f377, f353;
sub.f32 f384, f383, f382;
mul.f32 f385, f366, f377;
mul.f32 f386, f367, f379;
sub.f32 f387, f385, f386;
mul.f32 f388, f366, f379;
fma.rn.f32 f389, f367, f377, f388;
mul.f32 f390, f332, f389;
fma.rn.f32 f391, f387, f326, f390;
mul.f32 f392, f326, f389;
mul.f32 f393, f387, f332;
sub.f32 f394, f393, f392;
mul.f32 f395, f366, f387;
mul.f32 f396, f367, f389;
sub.f32 f397, f395, f396;
mul.f32 f398, f366, f389;
fma.rn.f32 f399, f367, f387, f398;
mul.f32 f400, f348, f399;
fma.rn.f32 f401, f397, f342, f400;
mul.f32 f402, f342, f399;
mul.f32 f403, f397, f348;
sub.f32 f404, f403, f402;
mul.f32 f405, f366, f397;
mul.f32 f406, f367, f399;
sub.f32 f407, f405, f406;
mul.f32 f408, f366, f399;
fma.rn.f32 f409, f367, f397, f408;
mul.f32 f410, f364, f409;
fma.rn.f32 f411, f407, f358, f410;
mul.f32 f412, f358, f409;
mul.f32 f413, f407, f364;
sub.f32 f414, f413, f412;
mul.f32 f415, f366, f407;
mul.f32 f416, f367, f409;
sub.f32 f417, f415, f416;
mul.f32 f418, f366, f409;
fma.rn.f32 f419, f367, f407, f418;
mul.f32 f420, f333, f419;
fma.rn.f32 f421, f417, f327, f420;
mul.f32 f422, f327, f419;
mul.f32 f423, f417, f333;
sub.f32 f424, f423, f422;
mul.f32 f425, f366, f417;
mul.f32 f426, f367, f419;
sub.f32 f427, f425, f426;
mul.f32 f428, f366, f419;
fma.rn.f32 f429, f367, f417, f428;
mul.f32 f430, f349, f429;
fma.rn.f32 f431, f427, f343, f430;
mul.f32 f432, f343, f429;
mul.f32 f433, f427, f349;
sub.f32 f434, f433, f432;
mul.f32 f435, f366, f427;
mul.f32 f436, f367, f429;
sub.f32 f437, f435, f436;
mul.f32 f438, f366, f429;
fma.rn.f32 f439, f367, f427, f438;
mul.f32 f440, f365, f439;
fma.rn.f32 f441, f437, f359, f440;
mul.f32 f442, f359, f439;
mul.f32 f443, f437, f365;
sub.f32 f444, f443, f442;
shl.b32 r19, r18, 2;
add.s32 r20, r12, r19;
barrier.sync 0;
mad.lo.s32 r21, r16, 324, r20;
st.shared.f32 [r21], f319;
st.shared.f32 [r21+36], f371;
st.shared.f32 [r21+72], f381;
st.shared.f32 [r21+108], f391;
st.shared.f32 [r21+144], f401;
st.shared.f32 [r21+180], f411;
st.shared.f32 [r21+216], f421;
st.shared.f32 [r21+252], f431;
st.shared.f32 [r21+288], f441;
barrier.sync 0;
ld.shared.f32 f445, [r15];
ld.shared.f32 f446, [r15+108];
ld.shared.f32 f447, [r15+216];
ld.shared.f32 f448, [r15+324];
ld.shared.f32 f449, [r15+432];
ld.shared.f32 f450, [r15+540];
ld.shared.f32 f451, [r15+648];
ld.shared.f32 f452, [r15+756];
ld.shared.f32 f453, [r15+864];
barrier.sync 0;
st.shared.f32 [r21], f321;
st.shared.f32 [r21+36], f374;
st.shared.f32 [r21+72], f384;
st.shared.f32 [r21+108], f394;
st.shared.f32 [r21+144], f404;
st.shared.f32 [r21+180], f414;
st.shared.f32 [r21+216], f424;
st.shared.f32 [r21+252], f434;
st.shared.f32 [r21+288], f444;
barrier.sync 0;
ld.shared.f32 f454, [r15];
ld.shared.f32 f455, [r15+108];
ld.shared.f32 f456, [r15+216];
ld.shared.f32 f457, [r15+324];
ld.shared.f32 f458, [r15+432];
ld.shared.f32 f459, [r15+540];
ld.shared.f32 f460, [r15+648];
ld.shared.f32 f461, [r15+756];
ld.shared.f32 f462, [r15+864];
add.f32 f463, f448, f451;
add.f32 f464, f457, f460;
mul.f32 f465, f463, 0f3F000000;
sub.f32 f466, f445, f465;
sub.f32 f467, f457, f460;
mul.f32 f468, f467, 0fBF5DB3D7;
mul.f32 f469, f464, 0f3F000000;
sub.f32 f470, f454, f469;
sub.f32 f471, f448, f451;
mul.f32 f472, f471, 0fBF5DB3D7;
add.f32 f473, f449, f452;
add.f32 f474, f458, f461;
mul.f32 f475, f473, 0f3F000000;
sub.f32 f476, f446, f475;
sub.f32 f477, f458, f461;
mul.f32 f478, f477, 0fBF5DB3D7;
mul.f32 f479, f474, 0f3F000000;
sub.f32 f480, f455, f479;
sub.f32 f481, f449, f452;
mul.f32 f482, f481, 0fBF5DB3D7;
add.f32 f483, f450, f453;
add.f32 f484, f459, f462;
mul.f32 f485, f483, 0f3F000000;
sub.f32 f486, f447, f485;
sub.f32 f487, f459, f462;
mul.f32 f488, f487, 0fBF5DB3D7;
mul.f32 f489, f484, 0f3F000000;
sub.f32 f490, f456, f489;
sub.f32 f491, f450, f453;
mul.f32 f492, f491, 0fBF5DB3D7;
add.f32 %0, f445, f463;
add.f32 %1, f454, f464;
add.f32 %2, f446, f473;
add.f32 %3, f455, f474;
add.f32 %4, f447, f483;
add.f32 %5, f456, f484;
add.f32 %6, f468, f466;
sub.f32 %7, f470, f472;
add.f32 %8, f478, f476;
sub.f32 %9, f480, f482;
add.f32 %10, f488, f486;
sub.f32 %11, f490, f492;
sub.f32 %12, f466, f468;
add.f32 %13, f472, f470;
sub.f32 %14, f476, f478;
add.f32 %15, f482, f480;
sub.f32 %16, f486, f488;
add.f32 %17, f492, f490;
})"
     // Operand map:
     //   outputs  %0..%17  = rmem[0].x, rmem[0].y, ..., rmem[8].x, rmem[8].y
     //   inputs   %18 = smem, %19 = lut_sp_9_243, %20 = lut_sp_9_27,
     //            %21..%43 = rmem[0..8] components in the order listed below.
     // The .y of elements 1, 2, 4, 5 and 7 is listed twice; the PTX above only
     // references the second copy, so %24, %27, %32, %35 and %40 are unused.
     // Operand numbers are positional: do not add, drop or reorder entries
     // without renumbering every %N in the PTX string.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y));
};




// cufftdx_private_function<338, float, 1>
//
// One inline-PTX statement that transforms the three complex<float> values in
// rmem[0..2] in place. It runs five rounds of three-input butterflies, with a
// shared-memory exchange between consecutive rounds. The include guard of
// this header names a 243-point FP32 inverse FFT; presumably this is the
// 3-elements-per-thread variant that exchanges (.x, .y) pairs together --
// TODO confirm against the code that selects function id 338.
//
// Parameters:
//   rmem  rmem[0..2].x/.y are read as inputs and overwritten with the results.
//   smem  32-bit shared-memory address used as the base of the exchange buffer.
//
// Also reads four float2 tables declared elsewhere (lut_sp_3_243, lut_sp_3_81,
// lut_sp_3_27, lut_sp_3_9) with ld.global.v2.f32.
//
// Outline of the PTX body (all statements are inside the raw string):
//   * r5 = tid.x / 81 (multiply-high + shift sequence), r7 = tid.x - 81 * r5
//     r8 = smem + tid.y * 1944 + r5 * 1944   (1944 bytes = 243 float2 values)
//   * Each of the first four rounds: butterfly with 0f3F000000 (0.5) and
//     0fBF5DB3D7 (about -0.8660254), scale outputs 1 and 2 by a table entry
//     and its square, barrier, st.shared.v2.f32 of three (.x, .y) pairs,
//     barrier, then ld.shared.v2.f32 from r11 = r8 + r7 * 8 at offsets
//     0 / 648 / 1296.
//       round  table (operand)      table index   store base            stride
//       1      lut_sp_3_243 (%7)    r7            r8 + r7*24             8
//       2      lut_sp_3_81  (%8)    r7 / 3        r8 + (r7%3)*8 + q*72   24
//       3      lut_sp_3_27  (%9)    r7 / 9        r8 + (r7%9)*8 + q*216  72
//       4      lut_sp_3_9   (%10)   r7 / 27       r8 + (r7%27)*8 + q*648 216
//     (q is the quotient used as the table index in that row.)
//   * Fifth round: plain butterfly that writes %0..%5.
//
// The body executes `barrier.sync 0` eight times, so every thread that takes
// part in barrier 0 has to execute this function; do not call it from
// divergent control flow.
//
// NOTE(review): the asm statement has no "memory" clobber even though it
// stores to and loads from shared memory. It relies on `volatile` plus its own
// barriers; confirm that callers do not depend on the compiler ordering
// surrounding shared-memory accesses relative to this statement.
template<> __forceinline__ __device__ void cufftdx_private_function<338, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Scratch registers (f*, r*, rd*) are declared inside the braces of the PTX
// string, so they are local to this asm block.
asm volatile (R"({
.reg .f32 f<217>;
.reg .b32 r<34>;
.reg .b64 rd<22>;
mov.u32 r1, %tid.y;
mov.u32 r2, %6;
mad.lo.s32 r3, r1, 1944, r2;
mov.u32 r4, %tid.x;
add.f32 f13, %13, %16;
add.f32 f14, %15, %17;
mul.f32 f15, f13, 0f3F000000;
sub.f32 f16, %11, f15;
sub.f32 f17, %15, %17;
mul.f32 f18, f17, 0fBF5DB3D7;
add.f32 f19, f18, f16;
sub.f32 f20, f16, f18;
mul.f32 f21, f14, 0f3F000000;
sub.f32 f22, %12, f21;
sub.f32 f23, %13, %16;
mul.f32 f24, f23, 0fBF5DB3D7;
sub.f32 f25, f22, f24;
add.f32 f26, f24, f22;
mul.wide.u32 rd2, r4, -901412889;
shr.u64 rd3, rd2, 38;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 81;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 1944, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %7;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f27, f28}, [rd6];
mul.f32 f31, f25, f28;
mul.f32 f32, f19, f28;
mul.f32 f33, f27, f25;
mul.f32 f34, f27, f27;
mul.f32 f35, f28, f28;
sub.f32 f36, f34, f35;
mul.f32 f37, f28, f27;
fma.rn.f32 f38, f28, f27, f37;
mul.f32 f39, f26, f38;
mul.f32 f40, f20, f38;
mul.f32 f41, f36, f26;
barrier.sync 0;
mad.lo.s32 r9, r7, 24, r8;
add.f32 f42, %12, f14;
add.f32 f43, %11, f13;
st.shared.v2.f32 [r9], {f43, f42};
fma.rn.f32 f44, f27, f19, f31;
sub.f32 f45, f33, f32;
st.shared.v2.f32 [r9+8], {f44, f45};
sub.f32 f46, f41, f40;
fma.rn.f32 f47, f36, f20, f39;
st.shared.v2.f32 [r9+16], {f47, f46};
barrier.sync 0;
shl.b32 r10, r7, 4;
sub.s32 r11, r9, r10;
ld.shared.v2.f32 {f48, f49}, [r11];
ld.shared.v2.f32 {f52, f53}, [r11+648];
ld.shared.v2.f32 {f56, f57}, [r11+1296];
add.f32 f60, f52, f56;
add.f32 f61, f53, f57;
mul.f32 f62, f60, 0f3F000000;
sub.f32 f63, f48, f62;
sub.f32 f64, f53, f57;
mul.f32 f65, f64, 0fBF5DB3D7;
add.f32 f66, f65, f63;
sub.f32 f67, f63, f65;
mul.f32 f68, f61, 0f3F000000;
sub.f32 f69, f49, f68;
sub.f32 f70, f52, f56;
mul.f32 f71, f70, 0fBF5DB3D7;
sub.f32 f72, f69, f71;
add.f32 f73, f71, f69;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 33;
cvt.u32.u64 r12, rd8;
mul.lo.s32 r13, r12, 3;
sub.s32 r14, r7, r13;
shl.b32 r15, r14, 3;
add.s32 r16, r8, r15;
mul.wide.u32 rd9, r12, 8;
mov.u64 rd10, %8;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f74, f75}, [rd11];
mul.f32 f78, f72, f75;
mul.f32 f79, f66, f75;
mul.f32 f80, f74, f72;
mul.f32 f81, f74, f74;
mul.f32 f82, f75, f75;
sub.f32 f83, f81, f82;
mul.f32 f84, f75, f74;
fma.rn.f32 f85, f75, f74, f84;
mul.f32 f86, f73, f85;
mul.f32 f87, f67, f85;
mul.f32 f88, f83, f73;
barrier.sync 0;
mad.lo.s32 r17, r12, 72, r16;
add.f32 f89, f49, f61;
add.f32 f90, f48, f60;
st.shared.v2.f32 [r17], {f90, f89};
fma.rn.f32 f91, f74, f66, f78;
sub.f32 f92, f80, f79;
st.shared.v2.f32 [r17+24], {f91, f92};
fma.rn.f32 f93, f83, f67, f86;
sub.f32 f94, f88, f87;
st.shared.v2.f32 [r17+48], {f93, f94};
barrier.sync 0;
ld.shared.v2.f32 {f95, f96}, [r11];
ld.shared.v2.f32 {f99, f100}, [r11+648];
ld.shared.v2.f32 {f103, f104}, [r11+1296];
add.f32 f107, f99, f103;
add.f32 f108, f100, f104;
mul.f32 f109, f107, 0f3F000000;
sub.f32 f110, f95, f109;
sub.f32 f111, f100, f104;
mul.f32 f112, f111, 0fBF5DB3D7;
add.f32 f113, f112, f110;
sub.f32 f114, f110, f112;
mul.f32 f115, f108, 0f3F000000;
sub.f32 f116, f96, f115;
sub.f32 f117, f99, f103;
mul.f32 f118, f117, 0fBF5DB3D7;
sub.f32 f119, f116, f118;
add.f32 f120, f118, f116;
mul.wide.u32 rd12, r7, 954437177;
shr.u64 rd13, rd12, 33;
cvt.u32.u64 r18, rd13;
mul.lo.s32 r19, r18, 9;
sub.s32 r20, r7, r19;
shl.b32 r21, r20, 3;
add.s32 r22, r8, r21;
mul.wide.u32 rd14, r18, 8;
mov.u64 rd15, %9;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f121, f122}, [rd16];
mul.f32 f125, f119, f122;
mul.f32 f126, f113, f122;
mul.f32 f127, f121, f119;
mul.f32 f128, f121, f121;
mul.f32 f129, f122, f122;
sub.f32 f130, f128, f129;
mul.f32 f131, f122, f121;
fma.rn.f32 f132, f122, f121, f131;
mul.f32 f133, f120, f132;
mul.f32 f134, f114, f132;
mul.f32 f135, f130, f120;
barrier.sync 0;
mad.lo.s32 r23, r18, 216, r22;
add.f32 f136, f96, f108;
add.f32 f137, f95, f107;
st.shared.v2.f32 [r23], {f137, f136};
fma.rn.f32 f138, f121, f113, f125;
sub.f32 f139, f127, f126;
st.shared.v2.f32 [r23+72], {f138, f139};
fma.rn.f32 f140, f130, f114, f133;
sub.f32 f141, f135, f134;
st.shared.v2.f32 [r23+144], {f140, f141};
barrier.sync 0;
ld.shared.v2.f32 {f142, f143}, [r11];
ld.shared.v2.f32 {f146, f147}, [r11+648];
ld.shared.v2.f32 {f150, f151}, [r11+1296];
add.f32 f154, f146, f150;
add.f32 f155, f147, f151;
mul.f32 f156, f154, 0f3F000000;
sub.f32 f157, f142, f156;
sub.f32 f158, f147, f151;
mul.f32 f159, f158, 0fBF5DB3D7;
add.f32 f160, f159, f157;
sub.f32 f161, f157, f159;
mul.f32 f162, f155, 0f3F000000;
sub.f32 f163, f143, f162;
sub.f32 f164, f146, f150;
mul.f32 f165, f164, 0fBF5DB3D7;
sub.f32 f166, f163, f165;
add.f32 f167, f165, f163;
mul.wide.u32 rd17, r7, 795364315;
shr.u64 rd18, rd17, 32;
cvt.u32.u64 r24, rd18;
sub.s32 r25, r7, r24;
shr.u32 r26, r25, 1;
add.s32 r27, r26, r24;
shr.u32 r28, r27, 4;
mul.lo.s32 r29, r28, 27;
sub.s32 r30, r7, r29;
shl.b32 r31, r30, 3;
add.s32 r32, r8, r31;
mul.wide.u32 rd19, r28, 8;
mov.u64 rd20, %10;
add.s64 rd21, rd20, rd19;
ld.global.v2.f32 {f168, f169}, [rd21];
mul.f32 f172, f166, f169;
mul.f32 f173, f160, f169;
mul.f32 f174, f168, f166;
mul.f32 f175, f168, f168;
mul.f32 f176, f169, f169;
sub.f32 f177, f175, f176;
mul.f32 f178, f169, f168;
fma.rn.f32 f179, f169, f168, f178;
mul.f32 f180, f167, f179;
mul.f32 f181, f161, f179;
mul.f32 f182, f177, f167;
barrier.sync 0;
mad.lo.s32 r33, r28, 648, r32;
add.f32 f183, f143, f155;
add.f32 f184, f142, f154;
st.shared.v2.f32 [r33], {f184, f183};
fma.rn.f32 f185, f168, f160, f172;
sub.f32 f186, f174, f173;
st.shared.v2.f32 [r33+216], {f185, f186};
fma.rn.f32 f187, f177, f161, f180;
sub.f32 f188, f182, f181;
st.shared.v2.f32 [r33+432], {f187, f188};
barrier.sync 0;
ld.shared.v2.f32 {f189, f190}, [r11];
ld.shared.v2.f32 {f193, f194}, [r11+648];
ld.shared.v2.f32 {f197, f198}, [r11+1296];
add.f32 f201, f193, f197;
add.f32 f202, f194, f198;
mul.f32 f203, f201, 0f3F000000;
sub.f32 f204, f189, f203;
sub.f32 f205, f194, f198;
mul.f32 f206, f205, 0fBF5DB3D7;
mul.f32 f207, f202, 0f3F000000;
sub.f32 f208, f190, f207;
sub.f32 f209, f193, f197;
mul.f32 f210, f209, 0fBF5DB3D7;
add.f32 %1, f190, f202;
add.f32 %0, f189, f201;
sub.f32 %3, f208, f210;
add.f32 %2, f206, f204;
add.f32 %5, f210, f208;
sub.f32 %4, f204, f206;
})"
     // Operand map:
     //   outputs  %0..%5 = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y,
     //                     rmem[2].x, rmem[2].y
     //   inputs   %6 = smem, %7 = lut_sp_3_243, %8 = lut_sp_3_81,
     //            %9 = lut_sp_3_27, %10 = lut_sp_3_9,
     //            %11..%17 = rmem[0..2] components in the order listed below.
     // rmem[1].y is listed twice; the PTX above only references the second
     // copy (%15), so %14 is unused.
     // Operand numbers are positional: do not add, drop or reorder entries
     // without renumbering every %N in the PTX string.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y));
};




// cufftdx_private_function<339, float, 1>
//
// One inline-PTX statement that transforms the three complex<float> values in
// rmem[0..2] in place. It runs five rounds of three-input butterflies, with a
// shared-memory exchange between consecutive rounds. Each exchange moves the
// .x values first and then reuses the same buffer for the .y values, so a
// thread's region is 972 bytes (243 floats). The include guard of this header
// names a 243-point FP32 inverse FFT; presumably this is the
// 3-elements-per-thread variant with the smaller shared-memory footprint --
// TODO confirm against the code that selects function id 339.
//
// Parameters:
//   rmem  rmem[0..2].x/.y are read as inputs and overwritten with the results.
//   smem  32-bit shared-memory address used as the base of the exchange buffer.
//
// Also reads four float2 tables declared elsewhere (lut_sp_3_243, lut_sp_3_81,
// lut_sp_3_27, lut_sp_3_9) with ld.global.v2.f32.
//
// Outline of the PTX body (all statements are inside the raw string):
//   * r5 = tid.x / 81 (multiply-high + shift sequence), r7 = tid.x - 81 * r5
//     r8 = smem + tid.y * 972 + r5 * 972
//   * Each of the first four rounds: butterfly with 0f3F000000 (0.5) and
//     0fBF5DB3D7 (about -0.8660254), scale outputs 1 and 2 by a table entry
//     and its square, then two store/load passes (.x, then .y). Each pass is
//     barrier, three st.shared.f32, barrier, three ld.shared.f32 from
//     r11 = r8 + r7 * 4 at offsets 0 / 324 / 648.
//       round  table (operand)      table index   store base            stride
//       1      lut_sp_3_243 (%7)    r7            r8 + r7*12             4
//       2      lut_sp_3_81  (%8)    r7 / 3        r8 + (r7%3)*4 + q*36   12
//       3      lut_sp_3_27  (%9)    r7 / 9        r8 + (r7%9)*4 + q*108  36
//       4      lut_sp_3_9   (%10)   r7 / 27       r8 + (r7%27)*4 + q*324 108
//     (q is the quotient used as the table index in that row.)
//   * Fifth round: plain butterfly that writes %0..%5.
//
// The body executes `barrier.sync 0` sixteen times, so every thread that takes
// part in barrier 0 has to execute this function; do not call it from
// divergent control flow.
//
// NOTE(review): the asm statement has no "memory" clobber even though it
// stores to and loads from shared memory. It relies on `volatile` plus its own
// barriers; confirm that callers do not depend on the compiler ordering
// surrounding shared-memory accesses relative to this statement.
template<> __forceinline__ __device__ void cufftdx_private_function<339, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Scratch registers (f*, r*, rd*) are declared inside the braces of the PTX
// string, so they are local to this asm block.
asm volatile (R"({
.reg .f32 f<193>;
.reg .b32 r<34>;
.reg .b64 rd<22>;
mov.u32 r1, %tid.y;
mov.u32 r2, %6;
mad.lo.s32 r3, r1, 972, r2;
mov.u32 r4, %tid.x;
add.f32 f13, %13, %16;
add.f32 f14, %11, f13;
add.f32 f15, %15, %17;
add.f32 f16, %12, f15;
mul.f32 f17, f13, 0f3F000000;
sub.f32 f18, %11, f17;
sub.f32 f19, %15, %17;
mul.f32 f20, f19, 0fBF5DB3D7;
add.f32 f21, f20, f18;
sub.f32 f22, f18, f20;
mul.f32 f23, f15, 0f3F000000;
sub.f32 f24, %12, f23;
sub.f32 f25, %13, %16;
mul.f32 f26, f25, 0fBF5DB3D7;
sub.f32 f27, f24, f26;
add.f32 f28, f26, f24;
mul.wide.u32 rd2, r4, -901412889;
shr.u64 rd3, rd2, 38;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 81;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 972, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %7;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f29, f30}, [rd6];
mul.f32 f33, f27, f30;
fma.rn.f32 f34, f29, f21, f33;
mul.f32 f35, f21, f30;
mul.f32 f36, f29, f27;
sub.f32 f37, f36, f35;
mul.f32 f38, f29, f29;
mul.f32 f39, f30, f30;
sub.f32 f40, f38, f39;
mul.f32 f41, f30, f29;
fma.rn.f32 f42, f30, f29, f41;
mul.f32 f43, f28, f42;
fma.rn.f32 f44, f40, f22, f43;
mul.f32 f45, f22, f42;
mul.f32 f46, f40, f28;
sub.f32 f47, f46, f45;
barrier.sync 0;
mad.lo.s32 r9, r7, 12, r8;
st.shared.f32 [r9], f14;
st.shared.f32 [r9+4], f34;
st.shared.f32 [r9+8], f44;
barrier.sync 0;
shl.b32 r10, r7, 3;
sub.s32 r11, r9, r10;
ld.shared.f32 f48, [r11];
ld.shared.f32 f49, [r11+324];
ld.shared.f32 f50, [r11+648];
barrier.sync 0;
st.shared.f32 [r9], f16;
st.shared.f32 [r9+4], f37;
st.shared.f32 [r9+8], f47;
barrier.sync 0;
ld.shared.f32 f51, [r11];
ld.shared.f32 f52, [r11+324];
ld.shared.f32 f53, [r11+648];
add.f32 f54, f49, f50;
add.f32 f55, f48, f54;
add.f32 f56, f52, f53;
add.f32 f57, f51, f56;
mul.f32 f58, f54, 0f3F000000;
sub.f32 f59, f48, f58;
sub.f32 f60, f52, f53;
mul.f32 f61, f60, 0fBF5DB3D7;
add.f32 f62, f61, f59;
sub.f32 f63, f59, f61;
mul.f32 f64, f56, 0f3F000000;
sub.f32 f65, f51, f64;
sub.f32 f66, f49, f50;
mul.f32 f67, f66, 0fBF5DB3D7;
sub.f32 f68, f65, f67;
add.f32 f69, f67, f65;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 33;
cvt.u32.u64 r12, rd8;
mul.lo.s32 r13, r12, 3;
sub.s32 r14, r7, r13;
shl.b32 r15, r14, 2;
add.s32 r16, r8, r15;
mul.wide.u32 rd9, r12, 8;
mov.u64 rd10, %8;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f70, f71}, [rd11];
mul.f32 f74, f68, f71;
fma.rn.f32 f75, f70, f62, f74;
mul.f32 f76, f62, f71;
mul.f32 f77, f70, f68;
sub.f32 f78, f77, f76;
mul.f32 f79, f70, f70;
mul.f32 f80, f71, f71;
sub.f32 f81, f79, f80;
mul.f32 f82, f71, f70;
fma.rn.f32 f83, f71, f70, f82;
mul.f32 f84, f69, f83;
fma.rn.f32 f85, f81, f63, f84;
mul.f32 f86, f63, f83;
mul.f32 f87, f81, f69;
sub.f32 f88, f87, f86;
barrier.sync 0;
mad.lo.s32 r17, r12, 36, r16;
st.shared.f32 [r17], f55;
st.shared.f32 [r17+12], f75;
st.shared.f32 [r17+24], f85;
barrier.sync 0;
ld.shared.f32 f89, [r11];
ld.shared.f32 f90, [r11+324];
ld.shared.f32 f91, [r11+648];
barrier.sync 0;
st.shared.f32 [r17], f57;
st.shared.f32 [r17+12], f78;
st.shared.f32 [r17+24], f88;
barrier.sync 0;
ld.shared.f32 f92, [r11];
ld.shared.f32 f93, [r11+324];
ld.shared.f32 f94, [r11+648];
add.f32 f95, f90, f91;
add.f32 f96, f89, f95;
add.f32 f97, f93, f94;
add.f32 f98, f92, f97;
mul.f32 f99, f95, 0f3F000000;
sub.f32 f100, f89, f99;
sub.f32 f101, f93, f94;
mul.f32 f102, f101, 0fBF5DB3D7;
add.f32 f103, f102, f100;
sub.f32 f104, f100, f102;
mul.f32 f105, f97, 0f3F000000;
sub.f32 f106, f92, f105;
sub.f32 f107, f90, f91;
mul.f32 f108, f107, 0fBF5DB3D7;
sub.f32 f109, f106, f108;
add.f32 f110, f108, f106;
mul.wide.u32 rd12, r7, 954437177;
shr.u64 rd13, rd12, 33;
cvt.u32.u64 r18, rd13;
mul.lo.s32 r19, r18, 9;
sub.s32 r20, r7, r19;
shl.b32 r21, r20, 2;
add.s32 r22, r8, r21;
mul.wide.u32 rd14, r18, 8;
mov.u64 rd15, %9;
add.s64 rd16, rd15, rd14;
ld.global.v2.f32 {f111, f112}, [rd16];
mul.f32 f115, f109, f112;
fma.rn.f32 f116, f111, f103, f115;
mul.f32 f117, f103, f112;
mul.f32 f118, f111, f109;
sub.f32 f119, f118, f117;
mul.f32 f120, f111, f111;
mul.f32 f121, f112, f112;
sub.f32 f122, f120, f121;
mul.f32 f123, f112, f111;
fma.rn.f32 f124, f112, f111, f123;
mul.f32 f125, f110, f124;
fma.rn.f32 f126, f122, f104, f125;
mul.f32 f127, f104, f124;
mul.f32 f128, f122, f110;
sub.f32 f129, f128, f127;
barrier.sync 0;
mad.lo.s32 r23, r18, 108, r22;
st.shared.f32 [r23], f96;
st.shared.f32 [r23+36], f116;
st.shared.f32 [r23+72], f126;
barrier.sync 0;
ld.shared.f32 f130, [r11];
ld.shared.f32 f131, [r11+324];
ld.shared.f32 f132, [r11+648];
barrier.sync 0;
st.shared.f32 [r23], f98;
st.shared.f32 [r23+36], f119;
st.shared.f32 [r23+72], f129;
barrier.sync 0;
ld.shared.f32 f133, [r11];
ld.shared.f32 f134, [r11+324];
ld.shared.f32 f135, [r11+648];
add.f32 f136, f131, f132;
add.f32 f137, f130, f136;
add.f32 f138, f134, f135;
add.f32 f139, f133, f138;
mul.f32 f140, f136, 0f3F000000;
sub.f32 f141, f130, f140;
sub.f32 f142, f134, f135;
mul.f32 f143, f142, 0fBF5DB3D7;
add.f32 f144, f143, f141;
sub.f32 f145, f141, f143;
mul.f32 f146, f138, 0f3F000000;
sub.f32 f147, f133, f146;
sub.f32 f148, f131, f132;
mul.f32 f149, f148, 0fBF5DB3D7;
sub.f32 f150, f147, f149;
add.f32 f151, f149, f147;
mul.wide.u32 rd17, r7, 795364315;
shr.u64 rd18, rd17, 32;
cvt.u32.u64 r24, rd18;
sub.s32 r25, r7, r24;
shr.u32 r26, r25, 1;
add.s32 r27, r26, r24;
shr.u32 r28, r27, 4;
mul.lo.s32 r29, r28, 27;
sub.s32 r30, r7, r29;
shl.b32 r31, r30, 2;
add.s32 r32, r8, r31;
mul.wide.u32 rd19, r28, 8;
mov.u64 rd20, %10;
add.s64 rd21, rd20, rd19;
ld.global.v2.f32 {f152, f153}, [rd21];
mul.f32 f156, f150, f153;
fma.rn.f32 f157, f152, f144, f156;
mul.f32 f158, f144, f153;
mul.f32 f159, f152, f150;
sub.f32 f160, f159, f158;
mul.f32 f161, f152, f152;
mul.f32 f162, f153, f153;
sub.f32 f163, f161, f162;
mul.f32 f164, f153, f152;
fma.rn.f32 f165, f153, f152, f164;
mul.f32 f166, f151, f165;
fma.rn.f32 f167, f163, f145, f166;
mul.f32 f168, f145, f165;
mul.f32 f169, f163, f151;
sub.f32 f170, f169, f168;
barrier.sync 0;
mad.lo.s32 r33, r28, 324, r32;
st.shared.f32 [r33], f137;
st.shared.f32 [r33+108], f157;
st.shared.f32 [r33+216], f167;
barrier.sync 0;
ld.shared.f32 f171, [r11];
ld.shared.f32 f172, [r11+324];
ld.shared.f32 f173, [r11+648];
barrier.sync 0;
st.shared.f32 [r33], f139;
st.shared.f32 [r33+108], f160;
st.shared.f32 [r33+216], f170;
barrier.sync 0;
ld.shared.f32 f174, [r11];
ld.shared.f32 f175, [r11+324];
ld.shared.f32 f176, [r11+648];
add.f32 f177, f172, f173;
add.f32 f178, f175, f176;
mul.f32 f179, f177, 0f3F000000;
sub.f32 f180, f171, f179;
sub.f32 f181, f175, f176;
mul.f32 f182, f181, 0fBF5DB3D7;
mul.f32 f183, f178, 0f3F000000;
sub.f32 f184, f174, f183;
sub.f32 f185, f172, f173;
mul.f32 f186, f185, 0fBF5DB3D7;
add.f32 %0, f171, f177;
add.f32 %1, f174, f178;
add.f32 %2, f182, f180;
sub.f32 %3, f184, f186;
sub.f32 %4, f180, f182;
add.f32 %5, f186, f184;
})"
     // Operand map:
     //   outputs  %0..%5 = rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y,
     //                     rmem[2].x, rmem[2].y
     //   inputs   %6 = smem, %7 = lut_sp_3_243, %8 = lut_sp_3_81,
     //            %9 = lut_sp_3_27, %10 = lut_sp_3_9,
     //            %11..%17 = rmem[0..2] components in the order listed below.
     // rmem[1].y is listed twice; the PTX above only references the second
     // copy (%15), so %14 is unused.
     // Operand numbers are positional: do not add, drop or reorder entries
     // without renumbering every %N in the PTX string.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y));
};


#endif
